From 528faed344119cc339a8425acef7794e05a9d505 Mon Sep 17 00:00:00 2001
From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com>
Date: Mon, 4 Dec 2023 17:00:41 +0800
Subject: [PATCH] [CMake cleanup] Move DDim etc. to common (#59105)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix conflict
* exception
* kunlun ci
* WIN_CI
* setup.py
* bug_fix
* hash
* auto_code_gen_WIN_CI
* inference_CI
* use_common_enforce
* delete pir_enforce
* delete_error
* change_cmake
* conflict
* cmake
* mac_CI
* inference_copy
* delete_pybind_common
* paddle_test
* split ddim constructor
* cc_test
* use cinn::common
* copy_infer
* delete_layer_test_new
* bug_fix
* infer
* fix inference bug
* conflict

---------

Co-authored-by: winter-wang <1030748926@qq.com>
---
 cmake/generic.cmake | 2 +-
 cmake/inference_lib.cmake | 44 +--
 paddle/cinn/api/tensor_node.h | 18 +-
 paddle/cinn/ast_gen_ius/ast_gen.cc | 6 +-
 .../cinn/auto_schedule/analysis/analyze_ir.cc | 4 +-
 .../cinn/auto_schedule/analysis/analyze_ir.h | 2 +-
 .../auto_schedule/analysis/analyze_ir_test.cc | 12 +-
 paddle/cinn/auto_schedule/auto_tuner.cc | 4 +-
 paddle/cinn/auto_schedule/auto_tuner.h | 4 +-
 paddle/cinn/auto_schedule/auto_tuner_test.cc | 4 +-
 .../cost_model/expr_cost_model.cc | 6 +-
 .../cost_model/expr_cost_model.h | 6 +-
 .../cinn/auto_schedule/cost_model/feature.cc | 6 +-
 .../cinn/auto_schedule/cost_model/feature.h | 4 +-
 .../cost_model/feature_extractor.cc | 16 +-
 .../cost_model/feature_extractor.h | 3 +-
 .../cost_model/feature_extractor_test.cc | 8 +-
 .../cost_model/xgb_cost_model.cc | 2 +-
 .../database/jsonfile_database_test.cc | 2 +-
 .../auto_schedule/measure/measurer_test.cc | 6 +-
 .../auto_schedule/measure/simple_runner.cc | 18 +-
 .../measure/simple_runner_test.cc | 10 +-
 .../cooperative_process_test.cc | 2 +-
 .../search_space/auto_gen_rule/auto_bind.h | 2 +-
 .../auto_gen_rule/auto_bind_test.cc | 4 +-
 .../auto_gen_rule/auto_gen_rule.cc | 3 +-
 .../auto_gen_rule/auto_gen_rule.h | 4 +-
 .../search_space/auto_gen_rule/auto_inline.cc | 2 +-
 .../search_space/auto_gen_rule/auto_inline.h | 2 +-
 .../auto_gen_rule/auto_inline_test.cc | 14 +-
 .../search_space/auto_gen_rule/auto_unroll.h | 3 +-
 .../auto_gen_rule/auto_unroll_test.cc | 8 +-
 .../auto_gen_rule/mix_rules_test.cc | 2 +-
 .../auto_gen_rule/multi_level_tiling.cc | 8 +-
 .../auto_gen_rule/multi_level_tiling.h | 4 +-
 .../auto_gen_rule/multi_level_tiling_test.cc | 22 +-
 .../auto_gen_rule/reduction_factoring.cc | 2 +-
 .../auto_gen_rule/reduction_factoring.h | 2 +-
 .../auto_gen_rule/reduction_factoring_test.cc | 4 +-
 .../search_space/auto_gen_rule/skip_rule.cc | 2 +-
 .../search_space/auto_gen_rule/skip_rule.h | 2 +-
 .../auto_gen_rule/skip_rule_test.cc | 8 +-
 .../search_space/auto_gen_rule/test_helper.cc | 20 +-
 .../search_space/auto_gen_rule/test_helper.h | 6 +-
 .../search_space/rule_sampler_test.cc | 4 +-
 .../search_space/search_state.cc | 3 +-
 .../auto_schedule/search_space/search_state.h | 4 +-
 .../search_space/search_state_test.cc | 2 +-
 .../evolutionary_search_test.cc | 10 +-
 .../mutate_rule/mutate_tile_size_test.cc | 4 +-
 .../auto_schedule/task/task_creator_test.cc | 4 +-
 .../cinn/auto_schedule/task/task_optimizer.cc | 8 +-
 .../auto_schedule/task/task_registry_test.cc | 10 +-
 paddle/cinn/auto_schedule/task/tune_task.cc | 3 +-
 paddle/cinn/auto_schedule/task/tune_task.h | 2 +-
 .../cinn/auto_schedule/task/tune_task_test.cc | 18 +-
 .../tests/performance_comparison_test.cc | 15 +-
 paddle/cinn/backends/codegen_c.cc | 9 +-
 paddle/cinn/backends/codegen_c_test.cc | 8 +-
 paddle/cinn/backends/codegen_cuda_dev.cc | 5 +-
 paddle/cinn/backends/codegen_cuda_util.h | 5 +-
 paddle/cinn/backends/codegen_debug_test.cc | 2 +-
 paddle/cinn/backends/compiler_test.cc | 63 ++--
 paddle/cinn/backends/ir_schedule_test.cc | 94 +++---
 paddle/cinn/backends/llvm/codegen_llvm.cc | 17 +-
 paddle/cinn/backends/llvm/codegen_llvm.h | 2 +-
 .../cinn/backends/llvm/codegen_llvm_test.cc | 111 ++++---
 paddle/cinn/backends/llvm/codegen_x86.cc | 4 +-
 paddle/cinn/backends/llvm/codegen_x86_test.cc | 15 +-
 .../backends/llvm/execution_engine_test.cc | 14 +-
 paddle/cinn/backends/llvm/llvm_intrin_rule.h | 2 +-
 paddle/cinn/backends/llvm/llvm_util.cc | 10 +-
 paddle/cinn/backends/llvm/llvm_util.h | 2 +-
 paddle/cinn/backends/nvrtc/nvrtc_util.cc | 4 +-
 paddle/cinn/common/arithmatic.cc | 2 +-
 paddle/cinn/common/axis.cc | 2 +-
 paddle/cinn/common/cas.cc | 6 +-
 paddle/cinn/common/cas_test.cc | 6 +-
 paddle/cinn/common/cinn_value.h | 2 +-
 paddle/cinn/common/cinn_value_test.cc | 2 +-
 paddle/cinn/common/common.h | 32 +-
 .../cinn/common/equation_graph_topo_walker.h | 2 +-
 paddle/cinn/common/graph_utils.cc | 2 +-
 paddle/cinn/common/graph_utils.h | 12 +-
 paddle/cinn/common/ir_util.cc | 6 +-
 paddle/cinn/common/ir_util.h | 4 +-
 paddle/cinn/common/make_subgraph_walker.h | 14 +-
 paddle/cinn/common/union_find.h | 2 +-
 paddle/cinn/frontend/computation_test.cc | 20 +-
 paddle/cinn/frontend/decomposer/activation.cc | 22 +-
 .../frontend/decomposer/activation_test.cc | 2 +-
 paddle/cinn/frontend/decomposer/batch_norm.cc | 57 ++--
 .../frontend/decomposer/batch_norm_test.cc | 4 +-
 paddle/cinn/frontend/decomposer/test_helper.h | 4 +-
 paddle/cinn/frontend/decomposer/top_k_test.cc | 2 +-
 paddle/cinn/frontend/decomposer_registry.h | 4 +-
 .../cinn/frontend/decomposer_registry_test.cc | 2 +-
 paddle/cinn/frontend/interpreter_test.cc | 3 +-
 paddle/cinn/frontend/net_builder.cc | 11 +-
 paddle/cinn/frontend/net_builder.h | 17 +-
 paddle/cinn/frontend/net_builder_test.cc | 98 +++---
 paddle/cinn/frontend/op_mapper_registry.h | 6 +-
 .../cinn/frontend/op_mappers/common_utils.h | 2 +-
 .../cinn/frontend/op_mappers/paddle/clip.cc | 19 +-
 .../frontend/op_mappers/paddle/constant.cc | 4 +-
 .../cinn/frontend/op_mappers/paddle/cumsum.cc | 2 +-
 .../frontend/op_mappers/paddle/elementwise.cc | 2 +-
 .../frontend/op_mappers/paddle/layer_norm.cc | 17 +-
 .../cinn/frontend/op_mappers/paddle/norm.cc | 13 +-
 .../cinn/frontend/op_mappers/paddle/reduce.cc | 2 +-
 .../cinn/frontend/op_mappers/paddle/scale.cc | 5 +-
 .../frontend/op_mappers/paddle/scatter.cc | 12 +-
 .../frontend/op_mappers/science/broadcast.cc | 2 +-
 .../frontend/op_mappers/science/transform.cc | 2 +-
 paddle/cinn/frontend/optimize.cc | 4 +-
 paddle/cinn/frontend/optimize.h | 4 +-
 paddle/cinn/frontend/paddle/model_parser.cc | 10 +-
 paddle/cinn/frontend/paddle/model_parser.h | 25 +-
 .../cinn/frontend/paddle_model_convertor.cc | 6 +-
 paddle/cinn/frontend/paddle_model_convertor.h | 4 +-
 .../frontend/paddle_model_convertor_test.cc | 2 +-
 .../cinn/frontend/paddle_model_to_program.cc | 3 +-
 .../cinn/frontend/paddle_model_to_program.h | 4 +-
 paddle/cinn/frontend/pass/auto_broadcast.cc | 2 +-
 paddle/cinn/frontend/pass/auto_cast.cc | 7 +-
 paddle/cinn/frontend/pass/auto_cast_test.cc | 18 +-
 paddle/cinn/frontend/pass/cast_collapsing.cc | 2 +-
 .../frontend/pass/cast_collapsing_test.cc | 16 +-
 .../cinn/frontend/pass/dead_code_eliminate.cc | 2 +-
 .../frontend/pass/dead_code_eliminate_test.cc | 4 +-
 paddle/cinn/frontend/pass/decomposer.cc | 2 +-
 paddle/cinn/frontend/pass/decomposer_test.cc | 4 +-
 .../frontend/pass/expand_zero_dim_pass.cc | 2 +-
 .../pass/expand_zero_dim_pass_test.cc | 4 +-
 .../frontend/pass/fill_constant_folding.cc | 2 +-
 .../pass/fill_constant_folding_test.cc | 8 +-
 .../frontend/pass/fill_constant_rewriter.cc | 2 +-
 paddle/cinn/frontend/pass/gemm_rewriter.cc | 2 +-
 .../cinn/frontend/pass/gemm_rewriter_test.cc | 20 +-
 paddle/cinn/frontend/pass/pass_test_helper.h | 8 +-
 .../frontend/pass/program_topoerror_test.cc | 2 +-
 paddle/cinn/frontend/pass/remove_identity.cc | 2 +-
 paddle/cinn/frontend/pass/test_helper.h | 2 +-
 .../frontend/pass/transpose_collapsing.cc | 2 +-
 .../pass/transpose_collapsing_test.cc | 22 +-
 .../frontend/pass/transpose_folding_base.h | 2 +-
 .../pass/transpose_folding_input_test.cc | 18 +-
 .../pass/transpose_folding_output_test.cc | 40 +--
 .../pass/transpose_scale_folding_test.cc | 28 +-
 paddle/cinn/frontend/program_pass.cc | 2 +-
 paddle/cinn/frontend/program_pass.h | 6 +-
 paddle/cinn/frontend/syntax.cc | 9 +-
 paddle/cinn/frontend/syntax.h | 37 +--
 paddle/cinn/frontend/syntax_test.cc | 6 +-
 paddle/cinn/frontend/var_type_utils.h | 7 +-
 .../hlir/dialect/operator/ir/manual_op.cc | 2 +-
 .../add_broadcast_to_elementwise_pass.cc | 2 +-
 .../group_with_group_merge_pass.cc | 16 +-
 .../group_with_group_merge_pass_utils.h | 5 +-
 .../group_merge/group_with_group_merge_util.h | 24 +-
 .../group_merge/op_with_group_merge_pass.cc | 7 +-
 .../group_merge/op_with_group_merge_util.h | 30 +-
 .../hlir/dialect/runtime/ir/jit_kernel_op.cc | 2 +-
 .../cinn/hlir/framework/accuracy_checker.cc | 8 +-
 .../hlir/framework/accuracy_checker_test.cc | 10 +-
 paddle/cinn/hlir/framework/buffer.cc | 10 +-
 paddle/cinn/hlir/framework/buffer.h | 16 +-
 paddle/cinn/hlir/framework/buffer_test.cc | 4 +-
 paddle/cinn/hlir/framework/graph.cc | 20 +-
 paddle/cinn/hlir/framework/graph.h | 12 +-
 paddle/cinn/hlir/framework/graph_compiler.cc | 20 +-
 paddle/cinn/hlir/framework/graph_compiler.h | 2 +-
 .../hlir/framework/graph_compiler_test.cc | 14 +-
 paddle/cinn/hlir/framework/graph_test.cc | 4 +-
 paddle/cinn/hlir/framework/instruction.cc | 12 +-
 paddle/cinn/hlir/framework/instruction.h | 2 +-
 .../cinn/hlir/framework/instruction_test.cc | 68 +++--
 paddle/cinn/hlir/framework/memory.cc | 2 +-
 paddle/cinn/hlir/framework/memory.h | 5 +-
 paddle/cinn/hlir/framework/node.cc | 49 +--
 paddle/cinn/hlir/framework/node.h | 24 +-
 paddle/cinn/hlir/framework/op_lowering.h | 2 +-
 .../cinn/hlir/framework/op_lowering_impl.cc | 37 +--
 paddle/cinn/hlir/framework/op_lowering_impl.h | 2 +-
 .../cinn/hlir/framework/op_lowering_test.cc | 22 +-
 .../cinn/hlir/framework/op_lowering_util.cc | 16 +-
 paddle/cinn/hlir/framework/op_strategy.h | 10 +-
 paddle/cinn/hlir/framework/op_test.cc | 10 +-
 .../cinn/hlir/framework/parallel_compiler.cc | 5 +-
 .../hlir/framework/parallel_compiler_test.cc | 6 +-
 .../hlir/framework/pir/compilation_task.cc | 3 +-
 .../hlir/framework/pir/op_lowering_impl.cc | 49 +--
 .../hlir/framework/pir/op_lowering_impl.h | 2 +-
 .../hlir/framework/pir/op_lowering_util.cc | 8 +-
 paddle/cinn/hlir/framework/pir/utils.cc | 8 +-
 paddle/cinn/hlir/framework/pir/utils.h | 4 +-
 .../hlir/framework/print_graph_pass_test.cc | 2 +-
 paddle/cinn/hlir/framework/schedule.h | 2 +-
 paddle/cinn/hlir/framework/scope_test.cc | 2 +-
 paddle/cinn/hlir/framework/tensor.h | 8 +-
 paddle/cinn/hlir/framework/tensor_test.cc | 2 +-
 .../cinn/hlir/framework/visualize_helper.cc | 6 +-
 paddle/cinn/hlir/framework/visualize_helper.h | 4 +-
 paddle/cinn/hlir/op/broadcast.cc | 6 +-
 paddle/cinn/hlir/op/contrib/argmax.cc | 16 +-
 paddle/cinn/hlir/op/contrib/argmax.h | 2 +-
 paddle/cinn/hlir/op/contrib/argmax_test.cc | 4 +-
 paddle/cinn/hlir/op/contrib/argmin.cc | 16 +-
 paddle/cinn/hlir/op/contrib/argmin.h | 2 +-
 paddle/cinn/hlir/op/contrib/argmin_test.cc | 4 +-
 paddle/cinn/hlir/op/contrib/assert_true.cc | 4 +-
 .../cinn/hlir/op/contrib/bitcast_convert.cc | 10 +-
 paddle/cinn/hlir/op/contrib/cholesky.cc | 4 +-
 paddle/cinn/hlir/op/contrib/gather_nd.cc | 21 +-
 paddle/cinn/hlir/op/contrib/gather_nd_test.cc | 4 +-
 .../cinn/hlir/op/contrib/gaussian_random.cc | 6 +-
 .../hlir/op/contrib/logical_right_shift.cc | 10 +-
 .../op/contrib/logical_right_shift_test.cc | 4 +-
 paddle/cinn/hlir/op/contrib/lookup_table.cc | 8 +-
 .../cinn/hlir/op/contrib/lookup_table_test.cc | 8 +-
 paddle/cinn/hlir/op/contrib/one_hot.cc | 18 +-
 paddle/cinn/hlir/op/contrib/one_hot_test.cc | 6 +-
 paddle/cinn/hlir/op/contrib/randint.cc | 6 +-
 paddle/cinn/hlir/op/contrib/reciprocal.cc | 8 +-
 .../cinn/hlir/op/contrib/reciprocal_test.cc | 4 +-
 paddle/cinn/hlir/op/contrib/repeat.cc | 20 +-
 paddle/cinn/hlir/op/contrib/repeat_test.cc | 4 +-
 paddle/cinn/hlir/op/contrib/resize.cc | 48 +--
 paddle/cinn/hlir/op/contrib/resize.h | 2 +-
 paddle/cinn/hlir/op/contrib/sort.cc | 36 +--
 paddle/cinn/hlir/op/contrib/sort.h | 4 +-
 paddle/cinn/hlir/op/contrib/sort_test.cc | 8 +-
 .../cinn/hlir/op/contrib/triangular_solve.cc | 4 +-
 paddle/cinn/hlir/op/contrib/uniform_random.cc | 6 +-
 paddle/cinn/hlir/op/custom_call.cc | 98 +++---
 paddle/cinn/hlir/op/elementwise.cc | 46 +--
 paddle/cinn/hlir/op/external_api_registry.cc | 10 +-
 paddle/cinn/hlir/op/external_api_registry.h | 9 +-
 .../hlir/op/external_api_registry_test.cc | 14 +-
 paddle/cinn/hlir/op/nn.cc | 16 +-
 paddle/cinn/hlir/op/op_broadcast_test.cc | 80 ++---
 paddle/cinn/hlir/op/op_nn_test.cc | 105 ++++---
 paddle/cinn/hlir/op/op_util.cc | 24 +-
 paddle/cinn/hlir/op/op_util.h | 9 +-
 paddle/cinn/hlir/op/reduction.cc | 12 +-
 paddle/cinn/hlir/op/reduction_test.cc | 37 +--
 paddle/cinn/hlir/op/transform.cc | 6 +-
 paddle/cinn/hlir/op/transform_test.cc | 18 +-
 paddle/cinn/hlir/pass/alterlayout.cc | 52 ++--
 paddle/cinn/hlir/pass/alterlayout_test.cc | 16 +-
 .../hlir/pass/check_fusion_accuracy_pass.cc | 12 +-
 .../pass/check_fusion_accuracy_pass_test.cc | 32 +-
 .../pass/common_subexpression_elimination.cc | 6 +-
 .../common_subexpression_elimination_test.cc | 6 +-
 paddle/cinn/hlir/pass/const_propagate.cc | 2 +-
 paddle/cinn/hlir/pass/const_propagate_test.cc | 4 +-
 .../cinn/hlir/pass/constant_folding_pass.cc | 4 +-
 .../hlir/pass/constant_folding_pass_test.cc | 2 +-
 .../hlir/pass/constant_folding_pass_util.cc | 4 +-
 paddle/cinn/hlir/pass/custom_call_pass.cc | 6 +-
 paddle/cinn/hlir/pass/dce_pass.cc | 4 +-
 paddle/cinn/hlir/pass/dce_pass_test.cc | 4 +-
 paddle/cinn/hlir/pass/dense_merge_pass.cc | 4 +-
 .../cinn/hlir/pass/dense_merge_pass_test.cc | 2 +-
 paddle/cinn/hlir/pass/dot_merger.cc | 12 +-
 paddle/cinn/hlir/pass/dot_merger_test.cc | 2 +-
 paddle/cinn/hlir/pass/fusion_helper_base.h | 5 +-
 paddle/cinn/hlir/pass/fusion_merge_pass.cc | 4 +-
 .../cinn/hlir/pass/fusion_merge_pass_test.cc | 36 +--
 .../cinn/hlir/pass/fusion_merge_pass_util.h | 4 +-
 .../hlir/pass/general_fusion_merge_pass.cc | 4 +-
 .../graph_group_fuse_helper.h | 4 +-
 .../pass/general_fusion_merge_pass_utils.h | 2 +-
 paddle/cinn/hlir/pass/infershape.cc | 4 +-
 paddle/cinn/hlir/pass/infershape.h | 2 +-
 paddle/cinn/hlir/pass/op_fusion_pass.cc | 4 +-
 paddle/cinn/hlir/pass/op_fusion_pass_test.cc | 22 +-
 paddle/cinn/hlir/pass/op_fusion_pass_util.h | 4 +-
 paddle/cinn/hlir/pass/opfusion.cc | 10 +-
 paddle/cinn/hlir/pass/opfusion_test.cc | 20 +-
 paddle/cinn/hlir/pass/reduce_split_pass.cc | 34 +--
 .../cinn/hlir/pass/reduce_split_pass_test.cc | 2 +-
 .../hlir/pass/single_group_optimize_pass.cc | 8 +-
 paddle/cinn/hlir/pass/test_dot_merger.cc | 4 +-
 paddle/cinn/hlir/pass/test_primitive_ops.cc | 6 +-
 paddle/cinn/hlir/pe/broadcast.cc | 6 +-
 paddle/cinn/hlir/pe/broadcast.h | 18 +-
 paddle/cinn/hlir/pe/elementwise.cc | 2 +-
 paddle/cinn/hlir/pe/elementwise.h | 15 +-
 paddle/cinn/hlir/pe/ir_schedule_pe.cc | 50 +--
 paddle/cinn/hlir/pe/ir_schedule_pe.h | 38 +--
 paddle/cinn/hlir/pe/load_params_test.cc | 2 +-
 paddle/cinn/hlir/pe/map_expr_to_ir.cc | 6 +-
 paddle/cinn/hlir/pe/map_expr_to_ir.h | 2 +-
 paddle/cinn/hlir/pe/nn.cc | 147 ++++-----
 paddle/cinn/hlir/pe/nn.h | 4 +-
 paddle/cinn/hlir/pe/nn_util.cc | 10 +-
 paddle/cinn/hlir/pe/pe_broadcast_test.cc | 48 +--
 paddle/cinn/hlir/pe/pe_elementwise_test.cc | 8 +-
 paddle/cinn/hlir/pe/pe_transform_test.cc | 17 +-
 paddle/cinn/hlir/pe/reduction.cc | 18 +-
 paddle/cinn/hlir/pe/reduction.h | 2 +-
 paddle/cinn/hlir/pe/schedule.cc | 104 +++----
 paddle/cinn/hlir/pe/schedule.h | 68 ++---
 paddle/cinn/hlir/pe/transform.cc | 132 ++++----
 paddle/cinn/hlir/pe/transform.h | 16 +-
 paddle/cinn/ir/buffer.cc | 6 +-
 paddle/cinn/ir/dim.cc | 2 +-
 .../ir/group_schedule/base_group_scheduler.cc | 2 +-
 .../ir/group_schedule/base_group_scheduler.h | 6 +-
 .../group_schedule/dy_shape_group_scheduler.h | 2 +-
 .../st_shape_group_scheduler.cc | 16 +-
 .../group_schedule/st_shape_group_scheduler.h | 2 +-
 paddle/cinn/ir/ir.cc | 10 +-
 paddle/cinn/ir/ir.h | 4 +-
 paddle/cinn/ir/ir_analyzer/ir_analyzer.cc | 6 +-
 paddle/cinn/ir/ir_base.h | 16 +-
 paddle/cinn/ir/ir_printer.cc | 4 +-
 paddle/cinn/ir/lowered_func.cc | 14 +-
 paddle/cinn/ir/module.h | 2 +-
 paddle/cinn/ir/op/ir_operators.cc | 16 +-
 paddle/cinn/ir/operation.cc | 2 +-
 paddle/cinn/ir/schedule/factorize_reduction.h | 2 +-
 paddle/cinn/ir/schedule/impl/base.cc | 2 +-
 paddle/cinn/ir/schedule/impl/for_type.cc | 3 +-
 .../ir/schedule/impl/loop_transformation.cc | 8 +-
 paddle/cinn/ir/schedule/ir_schedule_util.cc | 33 +-
 paddle/cinn/ir/schedule/ir_schedule_util.h | 8 +-
 paddle/cinn/ir/schedule_block_graph.cc | 22 +-
 paddle/cinn/ir/schedule_block_graph.h | 13 +-
 paddle/cinn/ir/tensor.cc | 14 +-
 paddle/cinn/ir/tensor.h | 4 +-
 paddle/cinn/ir/test/ir_compare_test.cc | 2 +-
 .../cinn/ir/test/schedule_block_graph_test.cc | 9 +-
 paddle/cinn/ir/test/schedule_desc_test.cc | 2 +-
 .../ir/test/st_shape_group_scheduler_test.cc | 4 +-
 paddle/cinn/ir/test/tensor_test.cc | 8 +-
 paddle/cinn/ir/utils/ir_copy.cc | 4 +-
 paddle/cinn/lang/buffer.cc | 2 +-
 paddle/cinn/lang/builtin.cc | 16 +-
 paddle/cinn/lang/compute.cc | 4 +-
 paddle/cinn/lang/lower.cc | 6 +-
 paddle/cinn/lang/lower.h | 15 +-
 paddle/cinn/lang/lower_impl.cc | 34 +--
 paddle/cinn/lang/lower_impl.h | 10 +-
 paddle/cinn/lang/lower_tensor_group.cc | 4 +-
 paddle/cinn/lang/lower_tensor_group.h | 2 +-
 paddle/cinn/lang/lower_test.cc | 2 +-
 paddle/cinn/lang/packed_func.h | 6 +-
 paddle/cinn/lang/packed_func_test.cc | 5 +-
 paddle/cinn/lang/placeholder.h | 4 +-
 paddle/cinn/optim/buffer_assign.cc | 10 +-
 paddle/cinn/optim/buffer_assign.h | 2 +-
 .../cinn/optim/call_arg_list_to_pod_value.cc | 2 +-
 paddle/cinn/optim/compute_inline_expand.cc | 6 +-
 paddle/cinn/optim/ir_simplify.cc | 53 ++--
 paddle/cinn/optim/map_extern_call.cc | 4 +-
 .../cinn/optim/remove_schedule_block_test.cc | 2 +-
 .../replace_cross_thread_reduction_test.cc | 2 +-
 paddle/cinn/optim/replace_var_with_expr.cc | 2 +-
 paddle/cinn/optim/transform_gpu_forloop.cc | 24 +-
 paddle/cinn/optim/transform_polyfor_to_for.cc | 6 +-
 paddle/cinn/optim/unroll_loops_test.cc | 4 +-
 paddle/cinn/optim/var_mod_simplify.cc | 4 +-
 paddle/cinn/optim/vectorize_loops.cc | 84 +++---
 paddle/cinn/optim/vectorize_loops_test.cc | 12 +-
 paddle/cinn/poly/domain.cc | 2 +-
 paddle/cinn/poly/graph.cc | 4 +-
 paddle/cinn/poly/graph.h | 8 +-
 paddle/cinn/poly/poly_scheduler.cc | 25 +-
 paddle/cinn/poly/schedule.cc | 2 +-
 paddle/cinn/poly/schedule.h | 10 +-
 paddle/cinn/poly/stage.cc | 8 +-
 paddle/cinn/poly/stage.h | 2 +-
 paddle/cinn/poly/stage_test.cc | 34 ++-
 paddle/cinn/pybind/CMakeLists.txt | 5 +-
 paddle/cinn/pybind/bind_utils.h | 6 +-
 paddle/cinn/pybind/common.cc | 64 ++--
 paddle/cinn/pybind/framework.cc | 26 +-
 paddle/cinn/pybind/frontend.cc | 32 +-
 paddle/cinn/pybind/ir/ir.cc | 4 +-
 paddle/cinn/pybind/ir/ir_api.cc | 14 +-
 paddle/cinn/pybind/ir/ir_context.cc | 2 +-
 paddle/cinn/pybind/ir/ir_context.h | 8 +-
 paddle/cinn/pybind/lang.cc | 10 +-
 paddle/cinn/pybind/pe.cc | 4 +-
 paddle/cinn/pybind/runtime.cc | 6 +-
 paddle/cinn/runtime/cpu/cblas.cc | 14 +-
 .../cinn/runtime/cpu/host_intrinsics_test.cc | 57 ++--
 paddle/cinn/runtime/cpu/mkl_math_test.cc | 67 +++--
 paddle/cinn/runtime/cpu/mkldnn_math.cc | 26 +-
 paddle/cinn/runtime/cpu/mkldnn_math_test.cc | 22 +-
 paddle/cinn/runtime/cpu/thread_backend.cc | 2 +-
 paddle/cinn/runtime/cuda/cublas_util.h | 8 +-
 paddle/cinn/runtime/cuda/cuda_module_test.cc | 4 +-
 paddle/cinn/runtime/cuda/cuda_util.cc | 12 +-
 paddle/cinn/runtime/cuda/cuda_util.h | 13 +-
 paddle/cinn/runtime/custom_function.cc | 14 +-
 paddle/cinn/runtime/custom_function_test.cc | 52 ++--
 paddle/cinn/runtime/flags.cc | 9 +-
 paddle/cinn/runtime/flags.h | 6 +-
 paddle/cinn/runtime/intrinsic.cc | 2 +-
 paddle/cinn/utils/data_util.cc | 32 +-
 paddle/cinn/utils/data_util.h | 6 +-
 paddle/common/CMakeLists.txt | 3 +
 paddle/common/array.h | 5 +
 paddle/common/ddim.cc | 51 ++++
 paddle/common/ddim.h | 103 +++----
 paddle/common/dim.h | 13 +
 paddle/common/enforce.h | 90 ++++--
 paddle/{phi/core => common}/errors.cc | 8 +-
 paddle/common/errors.h | 6 +
 paddle/{phi => }/common/layout.h | 18 +-
 paddle/common/macros.h | 22 +-
 .../distributed/auto_parallel/CMakeLists.txt | 2 +-
 .../auto_parallel/spmd_rules/CMakeLists.txt | 2 +-
 .../auto_parallel/test/CMakeLists.txt | 8 +-
 .../distributed/collective/CMakeLists.txt | 23 +-
 .../distributed/collective/process_group.h | 2 +-
 .../collective/process_group_bkcl.cc | 2 +-
 .../collective/process_group_with_stream.h | 2 +-
 .../collective/process_group_without_stream.h | 2 +-
 paddle/fluid/distributed/common/afs_warpper.h | 2 +-
 .../distributed/fleet_executor/CMakeLists.txt | 7 +-
 .../fleet_executor/compute_interceptor.cc | 2 +-
 .../fleet_executor/cond_interceptor.cc | 2 +-
 .../distributed/fleet_executor/dist_model.cc | 4 +-
 .../fleet_executor/start_interceptor.cc | 2 +-
 .../test/compute_interceptor_run_op_test.cc | 2 +-
 .../distributed/ps/service/CMakeLists.txt | 7 +-
 .../distributed/ps/service/brpc_ps_client.cc | 2 +-
 .../distributed/ps/service/brpc_ps_client.h | 2 +-
 .../distributed/ps/service/brpc_utils.cc | 8 +-
 .../ps/service/communicator/communicator.cc | 2 +-
 paddle/fluid/distributed/ps/service/env.h | 2 +-
 paddle/fluid/distributed/ps/service/server.h | 2 +-
 .../fluid/distributed/ps/table/CMakeLists.txt | 6 +-
 .../distributed/ps/table/graph/graph_edge.h | 2 +-
 paddle/fluid/distributed/ps/table/table.h | 2 +-
 paddle/fluid/distributed/rpc/CMakeLists.txt | 2 +-
 paddle/fluid/distributed/test/CMakeLists.txt | 1 +
 .../fluid/distributed/test/brpc_utils_test.cc | 12 +-
 paddle/fluid/eager/CMakeLists.txt | 10 +-
 .../fluid/eager/accumulation/CMakeLists.txt | 2 +-
 .../eager_generated/backwards/scale_node.cc | 2 +-
 paddle/fluid/eager/api/utils/CMakeLists.txt | 4 +-
 .../eager/auto_code_generator/CMakeLists.txt | 6 +
 .../eager/custom_operator/CMakeLists.txt | 4 +-
 .../custom_operator/custom_operator_utils.cc | 9 +-
 paddle/fluid/eager/eager_layout_transformer.h | 8 +-
 paddle/fluid/eager/eager_tensor.h | 2 +-
 paddle/fluid/eager/pylayer/CMakeLists.txt | 2 +-
 paddle/fluid/eager/utils.cc | 18 +-
 paddle/fluid/framework/CMakeLists.txt | 50 +--
 paddle/fluid/framework/convert_utils.h | 2 +-
 paddle/fluid/framework/custom_operator.cc | 6 +-
 paddle/fluid/framework/data_feed.cc | 14 +-
 paddle/fluid/framework/data_feed.h | 2 +-
 paddle/fluid/framework/data_layout.h | 2 +-
 .../fluid/framework/data_layout_transform.cc | 2 +-
 paddle/fluid/framework/data_set.h | 2 +-
 paddle/fluid/framework/details/CMakeLists.txt | 74 +++--
 .../details/broadcast_op_handle_test.h | 8 +-
 .../details/fetch_async_op_handle.cc | 6 +-
 .../details/gather_op_handle_test.cc | 4 +-
 paddle/fluid/framework/details/op_registry.h | 2 +-
 .../details/reduce_op_handle_test.cc | 8 +-
 .../details/scale_loss_grad_op_handle.cc | 2 +-
 paddle/fluid/framework/device_worker.h | 2 +-
 paddle/fluid/framework/dlpack_tensor.h | 2 +-
 paddle/fluid/framework/eigen.h | 8 +-
 paddle/fluid/framework/fleet/heter_wrapper.cc | 6 +-
 .../fluid/framework/heter_section_worker.cc | 2 +-
 paddle/fluid/framework/infershape_utils.cc | 10 +-
 paddle/fluid/framework/inplace_op_inference.h | 2 +-
 paddle/fluid/framework/ir/CMakeLists.txt | 7 +-
 .../framework/ir/attention_lstm_fuse_pass.cc | 8 +-
 .../framework/ir/auto_mixed_precision_pass.cc | 2 +-
 .../ir/conv2d_fusion_layout_transfer_pass.cc | 2 +-
 ..._trans_filter_dilations_nxn_to_1x1_pass.cc | 2 +-
 .../fluid/framework/ir/conv_bn_fuse_pass.cc | 7 +-
 .../ir/fc_elementwise_layernorm_fuse_pass.cc | 4 +-
 .../fused_multi_transformer_encoder_pass.cc | 8 +-
 .../framework/ir/fusion_group/CMakeLists.txt | 4 +-
 .../ir/fusion_group/code_generator_tester.cc | 4 +-
 .../framework/ir/ipu/delete_scale_op_pass.cc | 2 +-
 .../framework/ir/ipu/infer_shape_pass.cc | 6 +-
 .../framework/ir/layer_norm_fuse_pass.cc | 6 +-
 .../ir/memory_optimize_pass/CMakeLists.txt | 5 +-
 .../compute_propagate_scales_mkldnn_pass.cc | 2 +-
 ...ute_propagate_scales_mkldnn_pass_tester.cc | 8 +-
 .../conv_affine_channel_mkldnn_fuse_pass.cc | 4 +-
 .../framework/ir/mkldnn/cpu_quantize_pass.cc | 2 +-
 .../ir/mkldnn/multi_gru_fuse_pass.cc | 2 +-
 .../ir/mkldnn/multi_gru_seq_fuse_pass.cc | 2 +-
 .../params_quantization_mkldnn_pass_tester.cc | 2 +-
 .../ir/mkldnn/quant_dequant_mkldnn_pass.cc | 10 +-
 .../multi_devices_graph_pass.cc | 4 +-
 .../ir/multihead_matmul_fuse_pass.cc | 8 +-
 .../ir/multihead_matmul_roformer_fuse_pass.cc | 4 +-
 paddle/fluid/framework/ir/pass.h | 2 +-
 paddle/fluid/framework/ir/pass_test_util.cc | 3 +-
 paddle/fluid/framework/ir/pass_test_util.h | 2 +-
 .../ir/split_layernorm_to_math_ops_pass.cc | 8 +-
 .../trt_cross_multihead_matmul_fuse_pass.cc | 2 +-
 ...rt_delete_weight_dequant_linear_op_pass.cc | 2 +-
 .../trt_flash_multihead_matmul_fuse_pass.cc | 2 +-
 .../ir/trt_multihead_matmul_fuse_pass.cc | 8 +-
 .../ir/trt_qk_multihead_matmul_fuse_pass.cc | 4 +-
 .../ir/trt_remove_amp_strategy_op_pass.cc | 2 +-
 .../framework/ir/trt_support_nhwc_pass.cc | 4 +-
 .../framework/ir/vit_attention_fuse_pass.cc | 4 +-
 .../framework/ir/xpu/conv1d_xpu_fuse_pass.cc | 2 +-
 .../framework/ir/xpu/conv2d_bias_fuse_pass.cc | 2 +-
 .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 3 +-
 .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 3 +-
 .../xpu/fused_multi_transformer_xpu_pass.cc | 2 +-
 .../ir/xpu/multi_encoder_xpu_fuse_pass.cc | 6 +-
 paddle/fluid/framework/ir/xpu/pass_utils.cc | 12 +-
 paddle/fluid/framework/lod_tensor.cc | 4 +-
 paddle/fluid/framework/lod_tensor.h | 2 +-
 .../framework/new_executor/CMakeLists.txt | 2 +-
 .../new_executor/feed_fetch_utils.cc | 4 +-
 .../new_executor/instruction/CMakeLists.txt | 4 +-
 .../instruction/cinn_jit_instruction.cc | 2 +-
 .../new_executor/interpreter/CMakeLists.txt | 1 +
 .../framework/new_executor/interpreter/job.h | 4 +-
 .../framework/new_executor/interpreter/plan.h | 2 +-
 .../new_executor/workqueue/CMakeLists.txt | 4 +-
 .../new_executor/workqueue/event_count.h | 2 +-
 .../framework/no_need_buffer_vars_inference.h | 2 +-
 paddle/fluid/framework/op_desc.cc | 9 +-
 paddle/fluid/framework/op_desc.h | 2 +-
 paddle/fluid/framework/op_registry.h | 6 +-
 paddle/fluid/framework/op_version_registry.h | 2 +-
 paddle/fluid/framework/operator.cc | 2 +-
 paddle/fluid/framework/operator.h | 2 +-
 .../framework/paddle2cinn/CMakeLists.txt | 3 +-
 .../framework/paddle2cinn/cinn_cache_key.cc | 2 +-
 .../framework/paddle2cinn/cinn_cache_key.h | 2 +-
 paddle/fluid/framework/reader.h | 2 +-
 paddle/fluid/framework/shape_inference.h | 2 +-
 paddle/fluid/framework/tensor_util.cc | 20 +-
 paddle/fluid/imperative/CMakeLists.txt | 34 ++-
 paddle/fluid/imperative/all_reduce.cc | 2 +-
 paddle/fluid/imperative/gloo_context.cc | 2 +-
 paddle/fluid/imperative/infer_shape_context.h | 2 +-
 .../imperative/jit/program_desc_tracer.cc | 2 +-
 paddle/fluid/imperative/layout_autotune.cc | 4 +-
 paddle/fluid/imperative/layout_autotune.h | 2 +-
 paddle/fluid/imperative/layout_transformer.h | 16 +-
 paddle/fluid/imperative/reducer.cc | 2 +-
 paddle/fluid/imperative/variable_wrapper.h | 2 +-
 paddle/fluid/inference/CMakeLists.txt | 14 +-
 .../inference/analysis/ir_pass_manager.cc | 2 +-
 paddle/fluid/inference/api/CMakeLists.txt | 4 +-
 .../fluid/inference/api/analysis_predictor.cc | 8 +-
 paddle/fluid/inference/api/api_impl.cc | 4 +-
 paddle/fluid/inference/api/api_impl.h | 2 +-
 .../inference/api/details/CMakeLists.txt | 8 +-
 .../inference/api/details/zero_copy_tensor.cc | 13 +-
 .../fluid/inference/api/mkldnn_quantizer.cc | 2 +-
 .../inference/api/onnxruntime_predictor.cc | 2 +-
 .../fluid/inference/api/resource_manager.cc | 2 +-
 .../fluid/inference/capi_exp/CMakeLists.txt | 1 +
 paddle/fluid/inference/lite/tensor_utils.cc | 8 +-
 .../inference/tensorrt/convert/CMakeLists.txt | 5 +-
 .../tensorrt/convert/bilinear_interp_v2_op.cc | 2 +-
 .../inference/tensorrt/convert/dropout_op.cc | 2 +-
 .../tensorrt/convert/elementwise_op.cc | 2 +-
 .../tensorrt/convert/emb_eltwise_layernorm.cc | 14 +-
 .../tensorrt/convert/fill_constant_op.cc | 2 +-
 .../generic_and_custom_plugin_creater.cc | 2 +-
 .../tensorrt/convert/leaky_relu_op.cc | 2 +-
 .../tensorrt/convert/nearest_interp_op.cc | 2 +-
 .../tensorrt/convert/nearest_interp_v2_op.cc | 2 +-
 .../convert/preln_emb_eltwise_layernorm.cc | 8 +-
 .../tensorrt/convert/preln_residual_bias.cc | 6 +-
 .../tensorrt/convert/preln_skip_layernorm.cc | 4 +-
 .../prompt_tuning_emb_eltwise_layernorm.cc | 10 +-
 .../tensorrt/convert/test_op_converter.cc | 2 +-
 .../tensorrt/convert/test_split_op.cc | 6 +-
 .../inference/tensorrt/convert/ut_helper.h | 4 +-
 .../dynamic_shape_infermeta_factory.h | 2 +-
 paddle/fluid/inference/tensorrt/op_teller.cc | 8 +-
 .../inference/tensorrt/plugin/CMakeLists.txt | 6 +-
 .../elementwiseadd_transpose_op_plugin.cu | 26 +-
 .../tensorrt/plugin/generic_plugin.cu | 4 +-
 .../tensorrt/plugin/group_norm_op_plugin.cu | 6 +-
 .../plugin/instance_norm_op_plugin.cu | 8 +-
 .../tensorrt/plugin/layer_norm_op_plugin.cu | 16 +-
 .../plugin/preln_groupnorm_act_op_plugin.cu | 2 +-
 .../plugin/skip_groupnorm_act_op_plugin.cu | 2 +-
 .../plugin/trans_layernorm_op_plugin.cu | 24 +-
 .../inference/tensorrt/test_dynamic_engine.cc | 10 +-
 .../fluid/inference/tensorrt/test_engine.cc | 2 +-
 .../inference/tensorrt/trt_int8_calibrator.cc | 2 +-
 paddle/fluid/inference/utils/CMakeLists.txt | 4 +-
 .../translator/attribute_translator.cc | 4 +-
 .../ir_adaptor/translator/op_translator.cc | 31 +-
 .../translator/program_translator.cc | 2 +-
 .../ir_adaptor/translator/type_translator.cc | 6 +-
 paddle/fluid/ir_adaptor/translator/utils.cc | 2 +-
 paddle/fluid/jit/layer.cc | 2 +-
 paddle/fluid/jit/property.cc | 2 +-
 paddle/fluid/memory/CMakeLists.txt | 4 +-
 paddle/fluid/memory/allocation/CMakeLists.txt | 4 +-
 .../memory/allocation/allocator_facade.cc | 2 +-
 paddle/fluid/memory/stats.cc | 2 +-
 paddle/fluid/operators/CMakeLists.txt | 12 +-
 paddle/fluid/operators/affine_channel_op.cc | 8 +-
 paddle/fluid/operators/affine_channel_op.cu | 4 +-
 .../fluid/operators/affine_channel_op_xpu.cc | 4 +-
 .../fluid/operators/array_to_lod_tensor_op.cc | 13 +-
 paddle/fluid/operators/assign_value_op.h | 2 +-
 paddle/fluid/operators/attention_lstm_op.cc | 2 +-
 paddle/fluid/operators/batch_norm_op.cc | 15 +-
 paddle/fluid/operators/bilateral_slice_op.cc | 2 +-
 paddle/fluid/operators/bpr_loss_op.cc | 14 +-
 paddle/fluid/operators/cinn/CMakeLists.txt | 1 +
 .../operators/cinn/cinn_launch_context.cc | 6 +-
 .../operators/cinn/cinn_launch_context.h | 2 +-
 .../fluid/operators/class_center_sample_op.cu | 2 +-
 .../fluid/operators/collective/CMakeLists.txt | 5 +-
 .../fluid/operators/collective/barrier_op.h | 2 +-
 .../operators/collective/c_allgather_op.cc | 2 +-
 .../operators/collective/c_allgather_op.h | 2 +-
 .../operators/collective/c_broadcast_op.cu.cc | 2 +-
 .../operators/collective/c_embedding_op.cc | 4 +-
 .../fluid/operators/collective/c_reduce_op.h | 2 +-
 .../operators/collective/c_reducescatter_op.h | 2 +-
 .../operators/collective/global_gather_op.cc | 2 +-
 .../collective/global_gather_op.cu.cc | 4 +-
 .../operators/collective/global_scatter_op.cc | 2 +-
 .../collective/global_scatter_op.cu.cc | 4 +-
 .../collective/partial_allgather_op.h | 2 +-
 .../operators/collective/partial_recv_op.cc | 6 +-
 .../fluid/operators/collective/recv_v2_op.cc | 2 +-
 .../operators/collective/recv_v2_op.cu.cc | 4 +-
 .../operators/collective/send_v2_op.cu.cc | 8 +-
 .../operators/common_infer_shape_functions.cc | 2 +-
 .../controlflow/tensor_array_read_write_op.cc | 2 +-
 .../fluid/operators/controlflow/while_op.cc | 6 +-
 paddle/fluid/operators/correlation_op.cc | 2 +-
 paddle/fluid/operators/crop_op.cc | 10 +-
 paddle/fluid/operators/crop_op.h | 2 +-
 paddle/fluid/operators/cross_entropy_op.cc | 20 +-
 paddle/fluid/operators/cross_entropy_op.h | 8 +-
 paddle/fluid/operators/ctc_align_op.h | 2 +-
 .../custom_device_common_op_registry.cc | 10 +-
 paddle/fluid/operators/data_norm_op.cc | 14 +-
 .../fluid/operators/dequantize_abs_max_op.h | 2 +-
 paddle/fluid/operators/dequantize_log_op.h | 2 +-
 .../fluid/operators/detection/CMakeLists.txt | 5 +-
 .../detection/anchor_generator_op.cc | 4 +-
 .../operators/detection/anchor_generator_op.h | 2 +-
 .../detection/box_decoder_and_assign_op.cc | 10 +-
 .../detection/density_prior_box_op.cc | 4 +-
 .../detection/density_prior_box_op.h | 2 +-
 .../operators/detection/iou_similarity_op.cc | 2 +-
 .../detection/mine_hard_examples_op.cc | 2 +-
 .../operators/detection/multiclass_nms_op.cc | 2 +-
 .../detection/polygon_box_transform_op.cc | 2 +-
 .../fluid/operators/detection/prior_box_op.h | 2 +-
 paddle/fluid/operators/detection_map_op.cc | 2 +-
 paddle/fluid/operators/detection_map_op.h | 6 +-
 .../fluid/operators/dlnne/dlnne_engine_op.h | 10 +-
 .../elementwise/elementwise_functor.h | 2 +-
 .../operators/elementwise/elementwise_op.h | 10 +-
 .../elementwise/elementwise_op_function.h | 4 +-
 paddle/fluid/operators/expand_op.cc | 2 +-
 paddle/fluid/operators/fake_dequantize_op.cc | 4 +-
 paddle/fluid/operators/fake_dequantize_op.h | 2 +-
 paddle/fluid/operators/fake_quantize_op.cu.h | 3 +-
 paddle/fluid/operators/fake_quantize_op.h | 4 +-
 paddle/fluid/operators/fill_constant_op.cc | 6 +-
 paddle/fluid/operators/fill_op.cc | 2 +-
 paddle/fluid/operators/flatten_op.cc | 6 +-
 paddle/fluid/operators/flatten_op.h | 4 +-
 .../fused/cudnn_bn_stats_finalize.cu.h | 2 +-
 .../operators/fused/cudnn_norm_conv.cu.h | 8 +-
 .../fused/cudnn_scale_bias_add_relu.cu.h | 6 +-
 .../operators/fused/fused_attention_utils.h | 2 +-
 .../operators/fused/fused_bn_activation_op.cc | 2 +-
 .../fused/fused_bn_add_activation_op.cc | 2 +-
 .../fused/fused_embedding_fc_lstm_op.cc | 18 +-
 .../fused/fused_embedding_seq_pool_op.cc | 2 +-
 .../operators/fused/fused_feedforward_op.cc | 4 +-
 .../operators/fused/fused_feedforward_op.cu | 2 +-
 .../operators/fused/fused_gate_attention.h | 16 +-
 .../fused/fused_gate_attention_op.cu | 12 +-
 .../operators/fused/fused_gemm_epilogue_op.cc | 12 +-
 .../operators/fused/fused_gemm_epilogue_op.cu | 11 +-
 .../fused/fused_gemm_epilogue_op_xpu.cc | 4 +-
 .../fluid/operators/fused/fused_matmul_op.cc | 8 +-
 .../operators/fused/fused_seqpool_cvm_op.cc | 4 +-
 .../fused/fusion_conv_inception_op.cu | 12 +-
 .../fused/mkldnn/fusion_lstm_mkldnn_op.cc | 6 +-
 .../fused/mkldnn/multi_gru_mkldnn_op.cc | 2 +-
 paddle/fluid/operators/fused/multi_gru_op.cc | 2 +-
 .../operators/fused/resnet_basic_block_op.cc | 8 +-
 .../fused/resnet_basic_block_op_xpu.cc | 36 +--
 .../fluid/operators/fused/resnet_unit_op.cc | 8 +-
 .../fluid/operators/fused/resnet_unit_op.cu | 28 +-
 .../operators/fused/resnet_unit_op_xpu.cc | 18 +-
 .../fluid/operators/generator/CMakeLists.txt | 2 +-
 .../generator/get_expected_kernel_func.cc | 6 +-
 .../operators/grid_sampler_cudnn_op.cu.cc | 12 +-
 paddle/fluid/operators/hash_op.cc | 2 +-
 paddle/fluid/operators/hash_op.h | 2 +-
 paddle/fluid/operators/index_select_op.h | 4 +-
 paddle/fluid/operators/interpolate_op.cc | 8 +-
 paddle/fluid/operators/interpolate_op.cu | 12 +-
 paddle/fluid/operators/interpolate_op.h | 14 +-
 paddle/fluid/operators/is_empty_op.h | 2 +-
 paddle/fluid/operators/l1_norm_op.cc | 2 +-
 paddle/fluid/operators/layout_utils.h | 24 +-
 paddle/fluid/operators/linear_chain_crf_op.h | 2 +-
 paddle/fluid/operators/lite/ut_helper.h | 2 +-
 paddle/fluid/operators/lod_reset_op.h | 4 +-
 .../operators/lookup_table_dequant_op.cc | 4 +-
 paddle/fluid/operators/lookup_table_op.cc | 4 +-
 paddle/fluid/operators/lookup_table_op.cu | 2 +-
 paddle/fluid/operators/lookup_table_op.h | 2 +-
 paddle/fluid/operators/lookup_table_v2_op.cu | 2 +-
 paddle/fluid/operators/lookup_table_v2_op.h | 2 +-
 paddle/fluid/operators/lrn_op.cc | 10 +-
 paddle/fluid/operators/lrn_op.h | 4 +-
 .../fluid/operators/match_matrix_tensor_op.cc | 4 +-
 paddle/fluid/operators/math/CMakeLists.txt | 12 +-
 paddle/fluid/operators/math/beam_search.cc | 2 +-
 paddle/fluid/operators/math/beam_search.cu | 4 +-
 .../fluid/operators/math/beam_search_xpu.cc | 2 +-
 paddle/fluid/operators/math/context_project.h | 8 +-
 .../operators/math/eigen_values_vectors.h | 8 +-
 paddle/fluid/operators/math/sample_prob.cu | 2 +-
 paddle/fluid/operators/math/sample_prob.h | 2 +-
 paddle/fluid/operators/matmul_op.cc | 10 +-
 paddle/fluid/operators/merge_lod_tensor_op.cc | 8 +-
 .../operators/metrics/precision_recall_op.cc | 4 +-
 paddle/fluid/operators/minus_op.cc | 2 +-
 .../operators/mkldnn/interpolate_mkldnn_op.cc | 8 +-
 .../operators/mkldnn/layer_norm_mkldnn_op.cc | 2 +-
 .../operators/mkldnn/matmul_mkldnn_op.cc | 4 +-
 .../operators/mkldnn/quantize_mkldnn_op.cc | 2 +-
 .../operators/mkldnn/requantize_mkldnn_op.cc | 2 +-
 .../operators/mkldnn/reshape_mkldnn_op.cc | 35 +--
 .../operators/mkldnn/transpose_mkldnn_op.cc | 4 +-
 .../fluid/operators/modified_huber_loss_op.cc | 2 +-
 .../fluid/operators/modified_huber_loss_op.cu | 2 +-
 .../fluid/operators/modified_huber_loss_op.h | 2 +-
 paddle/fluid/operators/nccl/nccl_op.cu.cc | 4 +-
 paddle/fluid/operators/nce_op.cc | 6 +-
 paddle/fluid/operators/nce_op.h | 4 +-
 .../optimizers/decayed_adagrad_op.cc | 4 +-
 paddle/fluid/operators/optimizers/dpsgd_op.cc | 4 +-
 paddle/fluid/operators/optimizers/ftrl_op.cc | 6 +-
 .../pow2_decay_with_linear_warmup_op.cc | 2 +-
 .../operators/optimizers/proximal_gd_op.cc | 2 +-
 .../operators/optimizers/sparse_momentum_op.h | 2 +-
 paddle/fluid/operators/pad2d_op.cc | 4 +-
 paddle/fluid/operators/partial_sum_op.cc | 2 +-
 .../operators/positive_negative_pair_op.cc | 2 +-
 paddle/fluid/operators/pscore/CMakeLists.txt | 2 +
 .../pscore/distributed_lookup_table_op.cc | 8 +-
 .../pscore/distributed_lookup_table_op.h | 8 +-
 paddle/fluid/operators/pscore/fake_init_op.cc | 6 +-
 .../operators/pull_box_extended_sparse_op.cc | 9 +-
 paddle/fluid/operators/pull_box_sparse_op.cc | 5 +-
 .../fluid/operators/pull_gpups_sparse_op.cc | 5 +-
 paddle/fluid/operators/pull_sparse_op.cc | 5 +-
 paddle/fluid/operators/pull_sparse_v2_op.cc | 4 +-
 paddle/fluid/operators/pyramid_hash_op.cc | 8 +-
 paddle/fluid/operators/quantize_linear_op.h | 4 +-
 paddle/fluid/operators/randperm_op.h | 2 +-
 paddle/fluid/operators/range_op.h | 2 +-
 paddle/fluid/operators/rank_attention.cu.h | 2 +-
 paddle/fluid/operators/read_file_op.cc | 4 +-
 .../operators/reader/create_py_reader_op.cc | 4 +-
 .../reader/lod_tensor_blocking_queue.h | 2 +-
 .../operators/reader/reader_op_registry.cc | 2 +-
 paddle/fluid/operators/recurrent_op.cc | 14 +-
 paddle/fluid/operators/reduce_ops/reduce_op.h | 10 +-
 .../operators/reduce_ops/reduce_op_function.h | 8 +-
 .../fluid/operators/repeat_interleave_op.cc | 4 +-
 paddle/fluid/operators/reshape_op.cc | 24 +-
 paddle/fluid/operators/sampling_id_op.cc | 2 +-
 paddle/fluid/operators/sampling_id_op.h | 2 +-
 .../operators/sequence_ops/CMakeLists.txt | 2 +-
 .../sequence_ops/sequence_concat_op.cc | 10 +-
 .../sequence_ops/sequence_enumerate_op.h | 2 +-
 .../sequence_ops/sequence_expand_as_op.cu | 4 +-
 .../sequence_ops/sequence_expand_as_op.h | 4 +-
 .../sequence_ops/sequence_expand_op.cu | 2 +-
 .../sequence_ops/sequence_expand_op.h | 2 +-
 .../operators/sequence_ops/sequence_pad_op.cc | 12 +-
 .../sequence_ops/sequence_reshape_op.cc | 2 +-
 .../sequence_ops/sequence_slice_op.h | 8 +-
 .../sequence_softmax_cudnn_op.cu.cc | 6 +-
 .../sequence_ops/sequence_softmax_op.cc | 6 +-
 .../sequence_ops/sequence_unpad_op.cc | 2 +-
 .../sequence_ops/sequence_unpad_op.h | 2 +-
 paddle/fluid/operators/shuffle_batch_op.cc | 2 +-
 paddle/fluid/operators/slice_op.cc | 2 +-
 paddle/fluid/operators/sparse_attention_op.cu | 2 +-
 paddle/fluid/operators/split_op.h | 6 +-
 paddle/fluid/operators/spp_op.cc | 2 +-
 paddle/fluid/operators/spp_op.h | 17 +-
 paddle/fluid/operators/squeeze_op.h | 4 +-
 paddle/fluid/operators/stft_op.cc | 2 +-
 .../operators/string/faster_tokenizer_op.h | 9 +-
 paddle/fluid/operators/svd_helper.h | 50 +--
 paddle/fluid/operators/sync_batch_norm_op.cu | 2 +-
 .../fluid/operators/sync_batch_norm_utils.h | 4 +-
 paddle/fluid/operators/tdm_child_op.cc | 6 +-
 paddle/fluid/operators/tdm_sampler_op.cc | 11 +-
 paddle/fluid/operators/tdm_sampler_op.h | 2 +-
 paddle/fluid/operators/temporal_shift_op.cu | 12 +-
 paddle/fluid/operators/temporal_shift_op.h | 6 +-
 .../operators/tensor_array_to_tensor_op.cc | 10 +-
 .../operators/tensorrt/tensorrt_engine_op.h | 10 +-
 .../tensorrt/tensorrt_engine_op_test.cc | 2 +-
 paddle/fluid/operators/top_k_op.cu | 4 +-
 paddle/fluid/operators/top_k_op.h | 4 +-
 paddle/fluid/operators/top_k_op_xpu.cc | 2 +-
 paddle/fluid/operators/transfer_layout_op.h | 6 +-
 paddle/fluid/operators/transpose_op.cc | 10 +-
 paddle/fluid/operators/unbind_op.h | 2 +-
 .../uniform_random_batch_size_like_op.cc | 4 +-
 .../uniform_random_batch_size_like_op.cu | 4 +-
 paddle/fluid/operators/uniform_random_op.h | 2 +-
 paddle/fluid/operators/unique_op.h | 22 +-
 paddle/fluid/operators/utils.h | 8 +-
 paddle/fluid/operators/var_conv_2d_op.cc | 8 +-
 paddle/fluid/pir/dialect/CMakeLists.txt | 2 +-
 .../pir/dialect/kernel/ir/kernel_dialect.cc | 6 +-
 .../pir/dialect/op_generator/op_build_gen.py | 4 +-
 .../pir/dialect/operator/ir/api_builder.cc | 2 +-
 .../pir/dialect/operator/ir/api_builder.h | 2 +-
 .../dialect/operator/ir/attribute_storage.h | 2 +-
 .../pir/dialect/operator/ir/ir_meta_tensor.h | 1 +
 .../dialect/operator/ir/ir_selected_rows.cc | 5 +-
 .../pir/dialect/operator/ir/ir_tensor.cc | 4 +-
 .../pir/dialect/operator/ir/manual_op.cc | 2 +-
 .../pir/dialect/operator/ir/op_dialect.cc | 6 +-
 .../operator/utils/op_yaml_info_parser.cc | 1 +
 .../fluid/pir/dialect/operator/utils/utils.h | 1 +
 .../fusion/conv2d_add_act_fuse_pass.cc | 2 +-
 .../transforms/fusion/conv2d_add_fuse_pass.cc | 2 +-
 .../transforms/fusion/conv2d_bn_fuse_pass.cc | 4 +-
 .../params_sync_among_devices_pass.cc | 2 +-
 .../transforms/transform_general_functions.h | 4 +-
 paddle/fluid/platform/CMakeLists.txt | 55 ++--
 paddle/fluid/platform/bfloat16_test.cu | 2 +-
 .../platform/cuda_graph_with_memory_pool.h | 2 +-
 .../platform/device/custom/CMakeLists.txt | 4 +-
 .../device/custom/custom_device_test.cc | 12 +-
 .../fluid/platform/device/gpu/CMakeLists.txt | 10 +-
 .../platform/device/gpu/cuda/CMakeLists.txt | 2 +-
 .../fluid/platform/device/ipu/CMakeLists.txt | 2 +-
 .../fluid/platform/device/ipu/ipu_compiler.cc | 2 +-
 .../fluid/platform/device/ipu/ipu_executor.cc | 4 +-
 .../fluid/platform/device/xpu/CMakeLists.txt | 2 +
 paddle/fluid/platform/device_code_test.cc | 4 +-
 paddle/fluid/platform/dynload/CMakeLists.txt | 12 +-
 paddle/fluid/platform/enforce.h | 4 -
 paddle/fluid/platform/errors.h | 6 +-
 paddle/fluid/platform/float16_test.cu | 2 +-
 paddle/fluid/platform/macros.h | 2 +-
 paddle/fluid/platform/monitor.h | 2 +-
 paddle/fluid/platform/profiler/CMakeLists.txt | 12 +-
 .../profiler/custom_device/CMakeLists.txt | 2 +-
 .../tensor_operants_gen.py | 4 +-
 .../composite_backward_api.h | 122 ++++----
 .../composite_double_backward_api.h | 49 +--
 .../api/manual_prim/utils/static_utils.cc | 2 +-
 .../fluid/prim/api/manual_prim/utils/utils.h | 6 +-
 paddle/fluid/prim/utils/static/desc_tensor.h | 6 +-
 paddle/fluid/primitive/backend/CMakeLists.txt | 2 +-
 paddle/fluid/primitive/composite/composite.h | 51 ++--
 paddle/fluid/primitive/rule/vjp/details.h | 88 +++---
 paddle/fluid/primitive/type/lazy_tensor.h | 2 +-
 paddle/fluid/primitive/utils/CMakeLists.txt | 4 +-
 paddle/fluid/primitive/utils/utils.h | 4 +-
 paddle/fluid/pybind/CMakeLists.txt | 15 +-
 paddle/fluid/pybind/auto_parallel_py.cc | 4 +-
 paddle/fluid/pybind/eager.cc | 10 +-
 paddle/fluid/pybind/eager_functions.cc | 6 +-
 paddle/fluid/pybind/eager_math_op_patch.cc | 2 +-
 paddle/fluid/pybind/eager_method.cc | 4 +-
 paddle/fluid/pybind/eager_properties.cc | 8 +-
 paddle/fluid/pybind/eager_utils.cc | 5 +-
 paddle/fluid/pybind/eval_frame_tools.cc | 2 +-
 paddle/fluid/pybind/exception.cc | 4 +-
 .../pybind/global_value_getter_setter.cc | 2 +-
 paddle/fluid/pybind/pir.cc | 2 +-
 paddle/fluid/pybind/pir.h | 2 +-
 paddle/fluid/pybind/process_group_utils.h | 10 +-
 paddle/fluid/pybind/pybind.cc | 6 +-
 paddle/fluid/pybind/reader_py.cc | 12 +-
 paddle/fluid/pybind/tensor.cc | 42 +--
 paddle/fluid/pybind/tensor_py.h | 14 +-
 paddle/fluid/sub_graph/sub_graph_checker.cc | 2 +-
 paddle/phi/CMakeLists.txt | 3 +-
 paddle/phi/api/all.h | 4 +-
 paddle/phi/api/ext/exception.h | 99 ------
 paddle/phi/api/ext/op_meta_info.h | 2 +-
 paddle/phi/api/include/context_pool.h | 2 +-
 paddle/phi/api/include/tensor.h | 16 +-
 paddle/phi/api/lib/api_gen_utils.cc | 46 +--
 paddle/phi/api/lib/backend_set.h | 2 +-
 paddle/phi/api/lib/data_type_set.h | 2 +-
 paddle/phi/api/lib/kernel_dispatch.cc | 9 +-
 paddle/phi/api/lib/kernel_dispatch.h | 11 +-
 paddle/phi/api/lib/tensor.cc | 16 +-
 paddle/phi/api/lib/tensor_utils.cc | 3 +-
 paddle/phi/api/profiler/common_event.h | 2 +-
 paddle/phi/api/profiler/host_event_recorder.h | 2 +-
 paddle/phi/api/profiler/supplement_tracing.h | 2 +-
 .../api/yaml/generator/tensor_operants_gen.py | 6 +-
 paddle/phi/backends/CMakeLists.txt | 2 +-
 paddle/phi/backends/c_comm_lib.h | 4 +-
 paddle/phi/backends/context_pool.h | 2 +-
 paddle/phi/backends/device_memory_aligment.h | 2 +-
 paddle/phi/backends/dynload/CMakeLists.txt | 4 +-
 paddle/phi/backends/event.h | 2 +-
 paddle/phi/backends/gpu/cuda/cuda_graph.h | 4 +-
 paddle/phi/backends/gpu/cuda/cudnn_desc.h | 6 +-
 paddle/phi/backends/gpu/cuda/cudnn_helper.h | 7 +-
 paddle/phi/backends/gpu/gpu_context.cc | 2 +-
 paddle/phi/backends/gpu/rocm/miopen_desc.h | 6 +-
 paddle/phi/backends/gpu/rocm/miopen_helper.h | 4 +-
 paddle/phi/backends/onednn/matmul_utils.cc | 4 +-
 paddle/phi/backends/onednn/matmul_utils.h | 4 +-
 paddle/phi/backends/onednn/onednn_context.h | 2 +-
 paddle/phi/backends/onednn/onednn_helper.h | 12 +-
 paddle/phi/backends/onednn/onednn_reuse.h | 31 +-
 paddle/phi/backends/stream.h | 2 +-
 paddle/phi/backends/xpu/xpu_context.cc | 2 +-
 paddle/phi/capi/include/type_utils.h | 2 +-
 paddle/phi/capi/include/wrapper_base.h | 2 +-
 paddle/phi/capi/lib/c_meta_tensor.cc | 2 +-
 paddle/phi/capi/lib/c_tensor.cc | 2 +-
 paddle/phi/common/backend.h | 2 +-
 paddle/phi/common/cpstring_impl.h | 2 +-
 paddle/phi/common/data_type.h | 2 +-
 paddle/phi/common/int_array.cc | 2 +-
 paddle/phi/common/int_array.h | 8 +-
 paddle/phi/common/memory_utils.h | 2 +-
 paddle/phi/common/place.cc | 2 +-
 paddle/phi/common/place.h | 2 +-
 paddle/phi/common/scalar.h | 2 +-
 paddle/phi/common/tensor_ref.h | 2 +-
 paddle/phi/core/CMakeLists.txt | 2 -
 paddle/phi/core/attribute.h | 2 +-
 paddle/phi/core/compat/convert_utils.h | 2 +-
 paddle/phi/core/compat/op_utils.h | 2 +-
 paddle/phi/core/custom_kernel.h | 2 +-
 paddle/phi/core/ddim.cc | 230 --------------
 paddle/phi/core/ddim.h | 284 ------------------
 paddle/phi/core/dense_tensor.h | 1 +
 .../distributed/auto_parallel/dist_tensor.cc | 8 +-
 .../auto_parallel/inferspmd_utils.h | 2 +-
 .../auto_parallel/placement_types.h | 2 +-
 .../reshard/nd_mesh_reshard_function.cc | 16 +-
 .../reshard/p_to_s_reshard_function.cc | 2 +-
 .../auto_parallel/reshard/reshard_function.cc | 4 +-
 .../reshard/s_to_s_reshard_function.cc | 4 +-
 .../phi/core/distributed/bkcl_comm_context.cc | 2 +-
 .../distributed/check/nccl_dynamic_check.cc | 2 +-
 .../core/distributed/check/static_check.cc | 2 +-
 paddle/phi/core/distributed/comm_context.h | 2 +-
 .../core/distributed/comm_context_manager.h | 2 +-
 paddle/phi/core/distributed/comm_task.h | 2 +-
 .../phi/core/distributed/comm_task_manager.h | 2 +-
 .../phi/core/distributed/gloo_comm_context.h | 2 +-
 paddle/phi/core/distributed/gloo_utils.cc | 2 +-
 .../phi/core/distributed/nccl_comm_context.cc | 6 +-
 .../phi/core/distributed/nccl_comm_context.h | 2 +-
 paddle/phi/core/distributed/nccl_comm_task.h | 2 +-
 paddle/phi/core/distributed/nccl_tools.cc | 2 +-
 .../phi/core/distributed/store/CMakeLists.txt | 2 +-
 .../phi/core/distributed/xccl_comm_context.cc | 6 +-
 .../phi/core/distributed/xccl_comm_context.h | 2 +-
 paddle/phi/core/enforce.h | 78 +----
 paddle/phi/core/errors.h | 147 ---------
 paddle/phi/core/extended_tensor.cc | 1 +
 paddle/phi/core/flags.h | 2 +-
 paddle/phi/core/infermeta_utils.h | 2 +-
 paddle/phi/core/kernel_factory.cc | 2 +-
 paddle/phi/core/kernel_factory.h | 2 +-
 paddle/phi/core/macros.h | 67 -----
 paddle/phi/core/meta_tensor.h | 6 +-
 paddle/phi/core/mixed_vector.h | 2 +-
 paddle/phi/core/scope_guard.h | 2 +-
 paddle/phi/core/selected_rows_impl.h | 6 +-
 paddle/phi/core/sparse_coo_tensor.cc | 7 +-
 paddle/phi/core/sparse_csr_tensor.cc | 4 +-
 paddle/phi/core/storage_properties.h | 2 +-
 paddle/phi/core/tensor_array.cc | 1 +
 paddle/phi/core/tensor_base.h | 4 +-
 paddle/phi/core/tensor_meta.cc | 2 +-
 paddle/phi/core/tensor_meta.h | 5 +-
 paddle/phi/core/tensor_utils.cc | 2 +-
 paddle/phi/core/threadpool.h | 2 +-
 paddle/phi/core/utils/array.h | 142 ---------
 paddle/phi/core/utils/dim.h | 111 -------
 paddle/phi/core/utils/unroll_array_ops.h | 129 --------
 paddle/phi/core/visit_type.h | 2 +-
 paddle/phi/infermeta/backward.cc | 26 +-
 paddle/phi/infermeta/binary.cc | 197 ++++++------
 paddle/phi/infermeta/fusion.cc | 96 +++---
 paddle/phi/infermeta/multiary.cc | 219 +++++++-------
 paddle/phi/infermeta/nullary.cc | 28 +-
 paddle/phi/infermeta/nullary.h | 1 +
 paddle/phi/infermeta/sparse/binary.cc | 4 +-
 paddle/phi/infermeta/spmd_rules/concat.cc | 6 +-
 .../spmd_rules/default_data_parallel.cc | 8 +-
 .../phi/infermeta/spmd_rules/elementwise.cc | 16 +-
 paddle/phi/infermeta/spmd_rules/embedding.cc | 8 +-
 .../infermeta/spmd_rules/flash_attention.cc | 26 +-
 paddle/phi/infermeta/spmd_rules/flatten.cc | 6 +-
 paddle/phi/infermeta/spmd_rules/layer_norm.cc | 16 +-
 paddle/phi/infermeta/spmd_rules/matmul.cc | 10 +-
 paddle/phi/infermeta/spmd_rules/numel.cc | 2 +-
 paddle/phi/infermeta/spmd_rules/reduction.cc | 10 +-
 paddle/phi/infermeta/spmd_rules/replicated.cc | 8 +-
 paddle/phi/infermeta/spmd_rules/reshape.cc | 6 +-
 paddle/phi/infermeta/spmd_rules/slice.cc | 13 +-
 paddle/phi/infermeta/spmd_rules/softmax.cc | 6 +-
 paddle/phi/infermeta/spmd_rules/split.cc | 10 +-
 paddle/phi/infermeta/spmd_rules/squeeze.cc | 6 +-
 paddle/phi/infermeta/spmd_rules/stack.cc | 2 +-
 paddle/phi/infermeta/spmd_rules/transpose.cc | 9 +-
 paddle/phi/infermeta/spmd_rules/triu.cc | 6 +-
 paddle/phi/infermeta/spmd_rules/unsqueeze.cc | 6 +-
 paddle/phi/infermeta/spmd_rules/where.cc | 22 +-
 paddle/phi/infermeta/strings/nullary.cc | 2 +-
 paddle/phi/infermeta/ternary.cc | 80 ++---
 paddle/phi/infermeta/unary.cc | 258 ++++++++--------
 paddle/phi/kernels/array_kernel.cc | 8 +-
 paddle/phi/kernels/assign_kernel.cc | 2 +-
 paddle/phi/kernels/autotune/cache_base.h | 2 +-
 paddle/phi/kernels/autotune/gpu_timer.h | 2 +-
 paddle/phi/kernels/coalesce_tensor_kernel.cc | 2 +-
 .../kernels/cpu/affine_grid_grad_kernel.cc | 6 +-
 paddle/phi/kernels/cpu/affine_grid_kernel.cc | 6 +-
 paddle/phi/kernels/cpu/arange_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/arg_min_max_kernel.cc | 6 +-
 paddle/phi/kernels/cpu/argsort_grad_kernel.cc | 6 +-
 paddle/phi/kernels/cpu/argsort_kernel.cc | 6 +-
 paddle/phi/kernels/cpu/assign_pos_kernel.cc | 2 +-
 .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/batch_norm_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/concat_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/conv_util.h | 19 +-
 paddle/phi/kernels/cpu/cum_maxmin_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/cumprod_grad_kernel.cc | 2 +-
 .../phi/kernels/cpu/diagonal_grad_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/diagonal_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/dropout_grad_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/dropout_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/eig.h | 8 +-
 paddle/phi/kernels/cpu/eig_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/eigvals_kernel.cc | 12 +-
 .../cpu/fill_diagonal_tensor_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/flip_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/full_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/gaussian_kernel.cc | 2 +-
 .../kernels/cpu/generate_proposals_kernel.cc | 57 ++--
 paddle/phi/kernels/cpu/grid_sample_kernel.cc | 4 +-
 .../phi/kernels/cpu/group_norm_grad_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/group_norm_kernel.cc | 4 +-
 .../phi/kernels/cpu/gumbel_softmax_kernel.cc | 2 +-
 .../phi/kernels/cpu/hsigmoid_loss_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/index_add_impl.h | 4 +-
 .../phi/kernels/cpu/index_put_grad_kernel.cc | 15 +-
 paddle/phi/kernels/cpu/index_put_kernel.cc | 6 +-
 paddle/phi/kernels/cpu/index_sample_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/index_select_impl.h | 4 +-
 .../kernels/cpu/instance_norm_grad_kernel.cc | 2 +-
 .../phi/kernels/cpu/instance_norm_kernel.cc | 2 +-
 .../kernels/cpu/interpolate_grad_kernel.cc | 8 +-
 paddle/phi/kernels/cpu/interpolate_kernel.cc | 8 +-
 .../phi/kernels/cpu/kthvalue_grad_kernel.cc | 8 +-
 paddle/phi/kernels/cpu/kthvalue_kernel.cc | 8 +-
 .../phi/kernels/cpu/layer_norm_grad_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/layer_norm_kernel.cc | 2 +-
 .../kernels/cpu/limit_by_capacity_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/linspace_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/logspace_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/lstsq_kernel.cc | 14 +-
 paddle/phi/kernels/cpu/lu_kernel.cc | 6 +-
 .../kernels/cpu/masked_select_grad_kernel.cc | 4 +-
 .../phi/kernels/cpu/masked_select_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/matrix_nms_kernel.cc | 14 +-
 paddle/phi/kernels/cpu/mode_grad_kernel.cc | 8 +-
 paddle/phi/kernels/cpu/mode_kernel.cc | 8 +-
 .../phi/kernels/cpu/multiclass_nms3_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/nms_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/nonzero_kernel.cc | 2 +-
 .../kernels/cpu/overlap_add_grad_kernel.cc | 28 +-
 paddle/phi/kernels/cpu/overlap_add_kernel.cc | 28 +-
 paddle/phi/kernels/cpu/prior_box_kernel.cc | 2 +-
 .../cpu/prune_gate_by_capacity_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/randint_kernel.cc | 2 +-
 .../phi/kernels/cpu/random_routing_kernel.cc | 2 +-
 .../cpu/repeat_interleave_grad_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/rnn_functor.h | 2 +-
 paddle/phi/kernels/cpu/rnn_grad_kernel.cc | 10 +-
 paddle/phi/kernels/cpu/rnn_kernel.cc | 20 +-
 .../phi/kernels/cpu/roi_align_grad_kernel.cc | 8 +-
 paddle/phi/kernels/cpu/roi_align_kernel.cc | 6 +-
 .../phi/kernels/cpu/roi_pool_grad_kernel.cc | 8 +-
 paddle/phi/kernels/cpu/roi_pool_kernel.cc | 8 +-
 paddle/phi/kernels/cpu/send_u_recv_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/send_ue_recv_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/send_uv_grad_kernel.cc | 4 +-
 .../phi/kernels/cpu/shuffle_batch_kernel.cc | 4 +-
 .../sparse_weight_embedding_grad_kernel.cc | 2 +-
 paddle/phi/kernels/cpu/strided_copy_kernel.cc | 4 +-
 .../kernels/cpu/temporal_shift_grad_kernel.cc | 8 +-
 .../phi/kernels/cpu/temporal_shift_kernel.cc | 8 +-
 paddle/phi/kernels/cpu/top_k_grad_kernel.cc | 6 +-
 paddle/phi/kernels/cpu/top_k_kernel.cc | 6 +-
 .../kernels/cpu/triangular_solve_kernel.cc | 4 +-
 paddle/phi/kernels/cpu/uniform_kernel.cc | 2 +-
 .../kernels/cpu/unique_consecutive_functor.h | 16 +-
 .../kernels/cpu/unique_consecutive_kernel.cc | 2 +-
 paddle/phi/kernels/dist_grad_kernel.cc | 4 +-
 paddle/phi/kernels/empty_kernel.cc | 4 +-
 paddle/phi/kernels/flatten_grad_kernel.cc | 2 +-
 paddle/phi/kernels/full_kernel.cc | 2 +-
 paddle/phi/kernels/funcs/affine_grid_utils.h | 28 +-
 paddle/phi/kernels/funcs/axis_utils.h | 2 +-
 paddle/phi/kernels/funcs/batch_norm_utils.h | 24 +-
 paddle/phi/kernels/funcs/blas/blas.cc | 4 +-
 .../phi/kernels/funcs/blas/blaslt_impl.cu.h | 8 +-
 paddle/phi/kernels/funcs/broadcast_function.h | 30 +-
 paddle/phi/kernels/funcs/common_shape.h | 16 +-
 paddle/phi/kernels/funcs/compound_functors.h | 2 +-
 paddle/phi/kernels/funcs/concat_funcs.h | 2 +-
 paddle/phi/kernels/funcs/cufft_util.h | 2 +-
 paddle/phi/kernels/funcs/cumprod.h | 2 +-
 .../kernels/funcs/data_layout_transform.cc | 8 +-
 .../phi/kernels/funcs/data_layout_transform.h | 4 +-
 .../funcs/detail/activation_functions.h | 2 +-
 .../phi/kernels/funcs/detail/strided_memcpy.h | 2 +-
 paddle/phi/kernels/funcs/diag_functor.h | 2 +-
 paddle/phi/kernels/funcs/diagonal.h | 12 +-
 paddle/phi/kernels/funcs/dims_simplifier.h | 12 +-
 paddle/phi/kernels/funcs/dropout_impl.cu.h | 2 +-
 paddle/phi/kernels/funcs/elementwise_base.h | 19 +-
 .../phi/kernels/funcs/elementwise_functor.h | 2 +-
 .../phi/kernels/funcs/elementwise_grad_base.h | 10 +-
 paddle/phi/kernels/funcs/elementwise_utils.h | 6 +-
 paddle/phi/kernels/funcs/fc_functor.cu | 6 +-
 paddle/phi/kernels/funcs/fft.cc | 28 +-
 paddle/phi/kernels/funcs/fft.cu | 12 +-
 paddle/phi/kernels/funcs/fft_fill_conj.h | 6 +-
 paddle/phi/kernels/funcs/fft_key.h | 4 +-
 paddle/phi/kernels/funcs/for_range.h | 2 +-
 .../phi/kernels/funcs/fused_gemm_epilogue.h | 4 +-
 paddle/phi/kernels/funcs/gather.cu.h | 6 +-
 paddle/phi/kernels/funcs/gather.h | 10 +-
 .../kernels/funcs/gather_scatter_functor.cc | 2 +-
 paddle/phi/kernels/funcs/im2col.h | 4 +-
 paddle/phi/kernels/funcs/index_calculator.h | 13 +-
 paddle/phi/kernels/funcs/index_put_utils.h | 24 +-
 .../phi/kernels/funcs/interpolate_function.h | 8 +-
 paddle/phi/kernels/funcs/jit/CMakeLists.txt | 4 +-
 paddle/phi/kernels/funcs/jit/kernel_base.h | 2 +-
 paddle/phi/kernels/funcs/jit/registry.h | 2 +-
 paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 2 +-
 paddle/phi/kernels/funcs/math_function.cc | 6 +-
 paddle/phi/kernels/funcs/math_function.cu | 4 +-
 paddle/phi/kernels/funcs/matrix_reduce.cc | 7 +-
 paddle/phi/kernels/funcs/matrix_reduce.cu | 6 +-
 paddle/phi/kernels/funcs/matrix_solve.cu | 2 +-
 paddle/phi/kernels/funcs/matrix_solve.h | 2 +-
 paddle/phi/kernels/funcs/maxouting.h | 2 +-
 paddle/phi/kernels/funcs/mode.h | 2 +-
 paddle/phi/kernels/funcs/nanmedian_utils.h | 4 +-
 paddle/phi/kernels/funcs/norm_utils.cu.h | 2 +-
 paddle/phi/kernels/funcs/norm_utils.h | 4 +-
 paddle/phi/kernels/funcs/pooling.h | 4 +-
 paddle/phi/kernels/funcs/reduce_function.h | 12 +-
 paddle/phi/kernels/funcs/reduce_functor.h | 2 +-
 .../phi/kernels/funcs/reduce_grad_functions.h | 4 +-
 .../funcs/repeat_tensor2index_tensor.h | 2 +-
 paddle/phi/kernels/funcs/scatter.cu.h | 4 +-
 paddle/phi/kernels/funcs/scatter.h | 6 +-
 paddle/phi/kernels/funcs/segment_pooling.cc | 2 +-
 paddle/phi/kernels/funcs/select_impl.cu.h | 6 +-
 .../kernels/funcs/selected_rows_functor.cc | 12 +-
 .../kernels/funcs/selected_rows_functor.cu | 8 +-
 paddle/phi/kernels/funcs/sequence2batch.cc | 4 +-
 paddle/phi/kernels/funcs/sequence_pooling.cc | 2 +-
 paddle/phi/kernels/funcs/sequence_pooling.cu | 2 +-
 paddle/phi/kernels/funcs/slice.h | 10 +-
 paddle/phi/kernels/funcs/slice_utils.h | 4 +-
 paddle/phi/kernels/funcs/softmax.cu | 4 +-
 .../phi/kernels/funcs/sparse/common_shape.h | 6 +-
 paddle/phi/kernels/funcs/sparse/convolution.h | 2 +-
 .../kernels/funcs/sparse/flatten_indices.h | 2 +-
 paddle/phi/kernels/funcs/sparse/softmax.cu.h | 6 +-
 paddle/phi/kernels/funcs/sparse/softmax.h | 2 +-
 .../funcs/sparse/sparse_blas_impl.cu.h | 10 +-
 .../funcs/sparse/sparse_blas_impl.hip.h | 8 +-
 paddle/phi/kernels/funcs/strided_memcpy.h | 6 +-
 paddle/phi/kernels/funcs/strided_slice.h | 14 +-
 .../phi/kernels/funcs/top_k_function_cuda.h | 4 +-
 .../phi/kernels/funcs/transpose_function.cu.h | 6 +-
 paddle/phi/kernels/funcs/unique_functor.h | 22 +-
 paddle/phi/kernels/funcs/unsqueeze.h | 10 +-
 .../kernels/funcs/values_vectors_functor.h | 8 +-
 paddle/phi/kernels/funcs/vol2col.cc | 1 +
 paddle/phi/kernels/funcs/vol2col.h | 2 +-
 .../cpu/distributed_fused_lamb_init_kernel.cc | 2 +-
 ...used_softmax_mask_upper_triangle_kernel.cc | 2 +-
 .../kernels/fusion/cpu/fusion_gru_kernel.cc | 4 +-
 .../cpu/fusion_repeated_fc_relu_kernel.cc | 2 +-
 .../cpu/fusion_seqconv_eltadd_relu_kernel.cc | 6 +-
 .../cpu/fusion_seqexpand_concat_fc_kernel.cc | 2 +-
 .../cutlass/memory_efficient_attention.cu | 2 +-
 .../generate_variable_forward_kernels.py | 2 +-
 .../memory_efficient_attention_utils.h | 2 +-
 paddle/phi/kernels/fusion/gpu/cast_with_ptr.h | 6 +-
 .../kernels/fusion/gpu/conv_fusion_kernel.cu | 16 +-
 .../gpu/distributed_fused_lamb_init_kernel.cu | 2 +-
 ...used_embedding_eltwise_layernorm_kernel.cu | 2 +-
 .../fused_fc_elementwise_layernorm_kernel.cu | 4 +-
 .../gpu/fused_scale_bias_add_relu_kernel.cu | 4 +-
 .../fused_scale_bias_relu_conv_bn_kernel.cu | 14 +-
 ...softmax_mask_upper_triangle_grad_kernel.cu | 2 +-
 ...used_softmax_mask_upper_triangle_kernel.cu | 2 +-
 .../fusion_transpose_flatten_concat_kernel.cu | 2 +-
 .../fusion/gpu/multihead_matmul_kernel.cu | 6 +-
 .../fusion/gpu/skip_layernorm_kernel.cu | 2 +-
 paddle/phi/kernels/fusion/onednn/fc_kernel.cc | 14 +-
 .../fusion/onednn/fused_conv_kernel.cc | 2 +-
 .../fusion/onednn/fused_matmul_kernel.cc | 15 +-
 .../fusion/onednn/fused_transpose_kernel.cc | 8 +-
 .../fusion/onednn/fusion_gru_kernel.cc | 8 +-
 .../kernels/fusion/xpu/add_act_xpu_kernel.cc | 4 +-
 .../fusion/xpu/add_layernorm_xpu_kernel.cc | 4 +-
 .../kernels/fusion/xpu/bn_act_xpu_kernel.cc | 2 +-
 .../kernels/fusion/xpu/conv2d_xpu_kernel.cc | 7 +-
 .../fusion/xpu/conv_transpose_xpu_kernel.cc | 2 +-
 .../fusion/xpu/fast_layernorm_xpu_kernel.cc | 2 +-
 .../fusion/xpu/fast_where_xpu_kernel.cc | 6 +-
 .../xpu/fused_feedforward_grad_kernel.cc | 2 +-
.../fusion/xpu/fused_feedforward_kernel.cc | 2 +- ...fused_multi_transformer_int8_xpu_kernel.cc | 10 +- .../xpu/fused_multi_transformer_xpu_kernel.cc | 8 +- .../fusion/xpu/fused_softmax_mask_kernel.cc | 4 +- .../fusion/xpu/layer_norm_act_xpu_kernel.cc | 2 +- .../kernels/fusion/xpu/yolo_box_xpu_kernel.cc | 9 +- .../kernels/gpu/affine_grid_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/affine_grid_kernel.cu | 6 +- paddle/phi/kernels/gpu/arange_kernel.cu | 6 +- paddle/phi/kernels/gpu/arg_min_max_kernel.cu | 4 +- paddle/phi/kernels/gpu/argsort_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/argsort_kernel.cu | 8 +- paddle/phi/kernels/gpu/assign_pos_kernel.cu | 2 +- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 4 +- paddle/phi/kernels/gpu/c_split_kernel.cu | 4 +- paddle/phi/kernels/gpu/concat_kernel.cu | 4 +- paddle/phi/kernels/gpu/contiguous_kernel.cu | 10 +- .../kernels/gpu/conv_transpose_grad_kernel.cu | 8 +- .../phi/kernels/gpu/conv_transpose_kernel.cu | 8 +- paddle/phi/kernels/gpu/cum_maxmin_kernel.cu | 2 +- paddle/phi/kernels/gpu/decode_jpeg_kernel.cu | 2 +- paddle/phi/kernels/gpu/depthwise_conv.h | 1 + .../kernels/gpu/depthwise_conv_grad_kernel.cu | 6 +- .../phi/kernels/gpu/depthwise_conv_kernel.cu | 4 +- .../phi/kernels/gpu/diagonal_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/diagonal_kernel.cu | 6 +- paddle/phi/kernels/gpu/dist_kernel.cu | 2 +- .../phi/kernels/gpu/embedding_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/expand_as_kernel.cu | 2 +- paddle/phi/kernels/gpu/expand_kernel.cu | 4 +- .../gpu/fill_diagonal_tensor_grad_kernel.cu | 2 +- .../gpu/fill_diagonal_tensor_kernel.cu | 4 +- paddle/phi/kernels/gpu/flip_kernel.cu | 4 +- paddle/phi/kernels/gpu/full_kernel.cu | 2 +- .../gpu/gaussian_inplace_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/gaussian_kernel.cu | 2 +- .../kernels/gpu/generate_proposals_kernel.cu | 59 ++-- .../phi/kernels/gpu/group_norm_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/group_norm_kernel.cu | 6 +- .../phi/kernels/gpu/gumbel_softmax_kernel.cu | 2 +- .../phi/kernels/gpu/index_add_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/index_add_kernel.cu | 2 +- .../phi/kernels/gpu/index_put_grad_kernel.cu | 42 +-- paddle/phi/kernels/gpu/index_put_kernel.cu | 14 +- .../kernels/gpu/index_select_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/index_select_kernel.cu | 2 +- .../kernels/gpu/instance_norm_grad_kernel.cu | 2 +- .../phi/kernels/gpu/instance_norm_kernel.cu | 2 +- .../kernels/gpu/interpolate_grad_kernel.cu | 8 +- paddle/phi/kernels/gpu/interpolate_kernel.cu | 8 +- paddle/phi/kernels/gpu/kthvalue_kernel.cu | 12 +- .../phi/kernels/gpu/layer_norm_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 6 +- paddle/phi/kernels/gpu/lerp_grad_kernel.cu | 10 +- paddle/phi/kernels/gpu/lerp_kernel.cu | 8 +- paddle/phi/kernels/gpu/linspace_kernel.cu | 2 +- paddle/phi/kernels/gpu/logspace_kernel.cu | 2 +- paddle/phi/kernels/gpu/logsumexp_kernel.cu | 8 +- paddle/phi/kernels/gpu/lstsq_kernel.cu | 14 +- paddle/phi/kernels/gpu/lu_kernel.cu | 6 +- .../kernels/gpu/masked_select_grad_kernel.cu | 4 +- .../phi/kernels/gpu/masked_select_kernel.cu | 4 +- paddle/phi/kernels/gpu/mode_kernel.cu | 8 +- paddle/phi/kernels/gpu/multinomial_kernel.cu | 5 +- paddle/phi/kernels/gpu/nanmedian_kernel.cu | 4 +- paddle/phi/kernels/gpu/nms_kernel.cu | 2 +- paddle/phi/kernels/gpu/nonzero_kernel.cu | 2 +- paddle/phi/kernels/gpu/number_count_kernel.cu | 2 +- .../kernels/gpu/overlap_add_grad_kernel.cu | 27 +- 
paddle/phi/kernels/gpu/overlap_add_kernel.cu | 28 +- paddle/phi/kernels/gpu/p_recv_kernel.cu | 4 +- paddle/phi/kernels/gpu/p_send_kernel.cu | 2 +- paddle/phi/kernels/gpu/qr_kernel.cu | 22 +- paddle/phi/kernels/gpu/randint_kernel.cu | 2 +- paddle/phi/kernels/gpu/randperm_kernel.cu | 2 +- .../phi/kernels/gpu/reduce_amin_amax_common.h | 8 +- paddle/phi/kernels/gpu/reduce_grad.h | 4 +- paddle/phi/kernels/gpu/reduce_kernel.cu | 20 +- paddle/phi/kernels/gpu/roi_pool_kernel.cu | 2 +- paddle/phi/kernels/gpu/roll_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/roll_kernel.cu | 4 +- paddle/phi/kernels/gpu/roll_kernel_impl.h | 2 +- paddle/phi/kernels/gpu/send_u_recv_kernel.cu | 4 +- paddle/phi/kernels/gpu/send_ue_recv_kernel.cu | 4 +- paddle/phi/kernels/gpu/send_uv_grad_kernel.cu | 4 +- .../kernels/gpu/shuffle_batch_grad_kernel.cu | 2 +- .../phi/kernels/gpu/shuffle_batch_kernel.cu | 6 +- ...d_cross_entropy_with_logits_grad_kernel.cu | 2 +- ...igmoid_cross_entropy_with_logits_kernel.cu | 2 +- paddle/phi/kernels/gpu/strided_copy_kernel.cu | 18 +- .../kernels/gpu/temporal_shift_grad_kernel.cu | 8 +- .../phi/kernels/gpu/temporal_shift_kernel.cu | 8 +- paddle/phi/kernels/gpu/tile_kernel.cu | 10 +- paddle/phi/kernels/gpu/top_k_kernel.cu | 6 +- .../phi/kernels/gpu/top_p_sampling_kernel.cu | 14 +- .../kernels/gpu/triangular_solve_kernel.cu | 4 +- .../gpu/uniform_inplace_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/uniform_kernel.cu | 2 +- .../kernels/gpu/unique_consecutive_functor.h | 34 +-- .../kernels/gpu/unique_consecutive_kernel.cu | 2 +- paddle/phi/kernels/gpu/unique_kernel.cu | 58 ++-- .../phi/kernels/gpu/viterbi_decode_kernel.cu | 2 +- paddle/phi/kernels/gpu/yolo_box_kernel.cu | 3 +- .../phi/kernels/gpudnn/affine_grid_kernel.cu | 2 +- .../phi/kernels/gpudnn/conv_cudnn_frontend.h | 2 +- paddle/phi/kernels/gpudnn/conv_gpudnn_base.h | 4 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 8 +- paddle/phi/kernels/gpudnn/conv_kernel.cu | 4 +- .../gpudnn/conv_transpose_grad_kernel.cu | 27 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 18 +- paddle/phi/kernels/gpudnn/pool_grad_kernel.cu | 32 +- paddle/phi/kernels/gpudnn/pool_kernel.cu | 24 +- paddle/phi/kernels/impl/amp_kernel_impl.h | 3 +- .../kernels/impl/bilinear_grad_kernel_impl.h | 10 +- .../phi/kernels/impl/bilinear_kernel_impl.h | 4 +- .../impl/broadcast_tensors_kernel_impl.h | 2 +- .../kernels/impl/cholesky_solve_kernel_impl.h | 4 +- .../phi/kernels/impl/conv_grad_kernel_impl.h | 17 +- paddle/phi/kernels/impl/conv_kernel_impl.h | 9 +- .../impl/conv_transpose_grad_kernel_impl.h | 14 +- .../kernels/impl/conv_transpose_kernel_impl.h | 14 +- paddle/phi/kernels/impl/crop_kernel_impl.h | 2 +- .../impl/deformable_conv_grad_kernel_impl.h | 29 +- .../impl/deformable_conv_kernel_impl.h | 35 +-- .../impl/determinant_grad_kernel_impl.h | 2 +- .../kernels/impl/determinant_kernel_impl.h | 6 +- paddle/phi/kernels/impl/diag_embed_impl.h | 6 +- .../phi/kernels/impl/dot_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/eigh_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/einsum_grad_impl.h | 4 +- paddle/phi/kernels/impl/einsum_impl.h | 9 +- .../kernels/impl/expand_as_grad_kernel_impl.h | 2 +- .../phi/kernels/impl/expand_as_kernel_impl.h | 8 +- .../kernels/impl/expand_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/expand_kernel_impl.h | 4 +- paddle/phi/kernels/impl/fc_kernel_impl.h | 4 +- .../phi/kernels/impl/fft_grad_kernel_impl.h | 6 +- paddle/phi/kernels/impl/fft_kernel_impl.h | 4 +- .../phi/kernels/impl/fold_grad_kernel_impl.h | 4 +- 
paddle/phi/kernels/impl/fold_kernel_impl.h | 4 +- .../phi/kernels/impl/frame_grad_kernel_impl.h | 26 +- paddle/phi/kernels/impl/frame_kernel_impl.h | 28 +- .../impl/full_whit_tensor_kernel_impl.h | 2 +- .../kernels/impl/graph_message_passing_impl.h | 10 +- .../phi/kernels/impl/kron_grad_kernel_impl.h | 6 +- paddle/phi/kernels/impl/kron_kernel_impl.h | 8 +- paddle/phi/kernels/impl/lamb_kernel_impl.h | 4 +- .../phi/kernels/impl/lerp_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/lerp_kernel_impl.h | 2 +- paddle/phi/kernels/impl/lstsq_kernel_impl.h | 8 +- paddle/phi/kernels/impl/lu_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/lu_kernel_impl.h | 18 +- .../kernels/impl/lu_unpack_grad_kernel_impl.h | 2 +- .../kernels/impl/matmul_grad_kernel_impl.h | 46 +-- paddle/phi/kernels/impl/matmul_kernel_impl.h | 32 +- .../kernels/impl/matrix_rank_kernel_impl.h | 22 +- .../phi/kernels/impl/merged_momentum_impl.h | 2 +- .../kernels/impl/meshgrid_grad_kernel_impl.h | 2 +- .../phi/kernels/impl/meshgrid_kernel_impl.h | 4 +- .../phi/kernels/impl/multi_dot_kernel_impl.h | 16 +- .../phi/kernels/impl/pool_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/pool_kernel_impl.h | 2 +- ...ow2_decay_with_linear_warmup_kernel_impl.h | 2 +- paddle/phi/kernels/impl/qr_grad_kernel_impl.h | 10 +- .../kernels/impl/quant_linear_kernel_impl.h | 8 +- paddle/phi/kernels/impl/reduce_grad.h | 2 +- paddle/phi/kernels/impl/renorm_impl.h | 14 +- .../impl/repeat_interleave_grad_kernel_impl.h | 6 +- .../impl/repeat_interleave_kernel_impl.h | 30 +- .../kernels/impl/searchsorted_kernel_impl.h | 2 +- .../kernels/impl/segment_pool_kernel_impl.h | 2 +- .../kernels/impl/sequence_mask_kernel_impl.h | 8 +- .../kernels/impl/set_value_grad_kernel_impl.h | 2 +- .../phi/kernels/impl/set_value_kernel_impl.h | 4 +- .../phi/kernels/impl/slice_grad_kernel_impl.h | 18 +- .../impl/slogdeterminant_grad_kernel_impl.h | 2 +- .../impl/slogdeterminant_kernel_impl.h | 4 +- .../phi/kernels/impl/solve_grad_kernel_impl.h | 22 +- paddle/phi/kernels/impl/solve_kernel_impl.h | 8 +- .../impl/spectral_norm_grad_kernel_impl.h | 6 +- .../kernels/impl/spectral_norm_kernel_impl.h | 8 +- .../phi/kernels/impl/svd_grad_kernel_impl.h | 6 +- .../phi/kernels/impl/tile_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/tile_kernel_impl.h | 4 +- .../phi/kernels/impl/trace_grad_kernel_impl.h | 8 +- .../kernels/impl/unfold_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/unfold_kernel_impl.h | 4 +- paddle/phi/kernels/impl/warpctc_kernel_impl.h | 2 +- .../phi/kernels/impl/warprnnt_kernel_impl.h | 2 +- paddle/phi/kernels/is_empty_kernel.cc | 2 +- paddle/phi/kernels/kps/reduce_kernel.cu | 2 +- .../phi/kernels/legacy/cpu/randint_kernel.cc | 2 +- .../phi/kernels/legacy/cpu/uniform_kernel.cc | 2 +- .../phi/kernels/legacy/gpu/randint_kernel.cu | 2 +- .../phi/kernels/legacy/gpu/uniform_kernel.cu | 2 +- .../phi/kernels/legacy/xpu/compare_kernel.cc | 4 +- .../phi/kernels/legacy/xpu/randint_kernel.cc | 2 +- .../phi/kernels/legacy/xpu/uniform_kernel.cc | 2 +- paddle/phi/kernels/onednn/add_n_kernel.cc | 2 +- .../kernels/onednn/batch_norm_grad_kernel.cc | 4 +- .../phi/kernels/onednn/batch_norm_kernel.cc | 2 +- paddle/phi/kernels/onednn/cast_kernel.cc | 2 +- .../phi/kernels/onednn/concat_grad_kernel.cc | 4 +- paddle/phi/kernels/onednn/concat_kernel.cc | 4 +- paddle/phi/kernels/onednn/conv_grad_kernel.cc | 12 +- paddle/phi/kernels/onednn/conv_handler.h | 41 +-- paddle/phi/kernels/onednn/conv_kernel.cc | 2 +- .../kernels/onednn/conv_transpose_kernel.cc | 18 +- 
.../phi/kernels/onednn/dequantize_kernel.cc | 2 +- .../kernels/onednn/elementwise_grad_kernel.cc | 8 +- .../phi/kernels/onednn/expand_grad_kernel.cc | 11 +- paddle/phi/kernels/onednn/expand_kernel.cc | 4 +- paddle/phi/kernels/onednn/full_kernel.cc | 4 +- paddle/phi/kernels/onednn/gaussian_kernel.cc | 2 +- .../phi/kernels/onednn/interpolate_kernel.cc | 8 +- .../phi/kernels/onednn/matmul_grad_kernel.cc | 10 +- paddle/phi/kernels/onednn/matmul_kernel.cc | 20 +- paddle/phi/kernels/onednn/pad3d_kernel.cc | 2 +- paddle/phi/kernels/onednn/pad_kernel_impl.h | 4 +- paddle/phi/kernels/onednn/pool_grad_kernel.cc | 2 +- paddle/phi/kernels/onednn/pool_kernel.cc | 2 +- .../phi/kernels/onednn/reduce_kernel_impl.h | 12 +- .../kernels/onednn/reduce_mean_grad_kernel.cc | 2 +- paddle/phi/kernels/onednn/reshape_kernel.cc | 18 +- paddle/phi/kernels/onednn/shape_kernel.cc | 6 +- .../phi/kernels/onednn/slice_grad_kernel.cc | 2 +- paddle/phi/kernels/onednn/slice_kernel.cc | 6 +- paddle/phi/kernels/onednn/split_kernel.cc | 4 +- .../phi/kernels/onednn/squeeze_grad_kernel.cc | 4 +- paddle/phi/kernels/onednn/squeeze_kernel.cc | 6 +- paddle/phi/kernels/onednn/stack_kernel.cc | 10 +- .../kernels/onednn/transpose_grad_kernel.cc | 2 +- paddle/phi/kernels/onednn/transpose_kernel.cc | 4 +- .../kernels/primitive/datamover_primitives.h | 2 +- .../elementwise_multiply_kernel.cc | 2 +- .../selected_rows/impl/add_n_kernel_impl.h | 2 +- .../selected_rows/impl/lamb_kernel_impl.h | 4 +- .../phi/kernels/sparse/cpu/coalesce_kernel.cc | 4 +- paddle/phi/kernels/sparse/cpu/conv_kernel.cc | 2 +- .../kernels/sparse/cpu/elementwise_kernel.cc | 8 +- paddle/phi/kernels/sparse/cpu/mask_kernel.cc | 2 +- .../kernels/sparse/cpu/reshape_grad_kernel.cc | 4 +- .../phi/kernels/sparse/cpu/reshape_kernel.cc | 6 +- paddle/phi/kernels/sparse/cpu/slice_kernel.cc | 2 +- .../kernels/sparse/cpu/softmax_grad_kernel.cc | 8 +- .../phi/kernels/sparse/cpu/softmax_kernel.cc | 2 +- .../kernels/sparse/cpu/sparse_utils_kernel.cc | 2 +- paddle/phi/kernels/sparse/cpu/sum_kernel.cc | 16 +- paddle/phi/kernels/sparse/gpu/addmm_kernel.cu | 8 +- .../phi/kernels/sparse/gpu/coalesce_kernel.cu | 4 +- paddle/phi/kernels/sparse/gpu/conv_kernel.cu | 2 +- paddle/phi/kernels/sparse/gpu/full_kernel.cu | 4 +- .../sparse/gpu/fused_attention_kernel.cu | 3 +- paddle/phi/kernels/sparse/gpu/mask_kernel.cu | 2 +- .../kernels/sparse/gpu/matmul_grad_kernel.cu | 2 +- .../phi/kernels/sparse/gpu/matmul_kernel.cu | 14 +- paddle/phi/kernels/sparse/gpu/mv_kernel.cu | 8 +- .../kernels/sparse/gpu/reshape_grad_kernel.cu | 4 +- .../phi/kernels/sparse/gpu/reshape_kernel.cu | 4 +- paddle/phi/kernels/sparse/gpu/slice_kernel.cu | 2 +- .../kernels/sparse/gpu/softmax_grad_kernel.cu | 4 +- .../phi/kernels/sparse/gpu/softmax_kernel.cu | 2 +- paddle/phi/kernels/sparse/gpu/sum_kernel.cu | 16 +- .../phi/kernels/sparse/sparse_utils_kernel.h | 2 +- paddle/phi/kernels/sparse/unary_kernel.h | 2 +- paddle/phi/kernels/squeeze_grad_kernel.cc | 2 +- .../kernels/stride/as_strided_grad_kernel.cc | 4 +- .../phi/kernels/stride/complex_grad_kernel.cc | 8 +- .../kernels/stride/diagonal_grad_kernel.cc | 4 +- paddle/phi/kernels/stride/diagonal_kernel.cc | 4 +- .../phi/kernels/stride/flatten_grad_kernel.cc | 4 +- paddle/phi/kernels/stride/flatten_kernel.cc | 6 +- .../stride/index_select_grad_kernel.cc | 4 +- .../phi/kernels/stride/index_select_kernel.cc | 4 +- .../phi/kernels/stride/reshape_grad_kernel.cc | 2 +- .../phi/kernels/stride/slice_grad_kernel.cc | 4 +- paddle/phi/kernels/stride/slice_kernel.cc | 5 +- 
.../phi/kernels/stride/squeeze_grad_kernel.cc | 4 +- paddle/phi/kernels/stride/squeeze_kernel.cc | 2 +- .../stride/strided_slice_grad_kernel.cc | 4 +- .../kernels/stride/strided_slice_kernel.cc | 5 +- .../stride/tensor_unfold_grad_kernel.cc | 4 +- .../kernels/stride/unsqueeze_grad_kernel.cc | 4 +- paddle/phi/kernels/stride/unsqueeze_kernel.cc | 7 +- paddle/phi/kernels/stride/view_grad_kernel.cc | 2 +- paddle/phi/kernels/strings/gpu/copy_utils.h | 8 +- .../kernels/strings/strings_empty_kernel.cc | 2 +- paddle/phi/kernels/strings/unicode.h | 2 +- paddle/phi/kernels/strings/unicode_flag.h | 2 +- paddle/phi/kernels/transfer_layout_kernel.cc | 2 +- .../kernels/triangular_solve_grad_kernel.h | 2 +- paddle/phi/kernels/unsqueeze_grad_kernel.cc | 2 +- .../phi/kernels/xpu/activation_grad_kernel.cc | 8 +- paddle/phi/kernels/xpu/activation_kernel.cc | 2 +- paddle/phi/kernels/xpu/arange_kernel.cc | 2 +- paddle/phi/kernels/xpu/arg_min_max_kernel.cc | 6 +- paddle/phi/kernels/xpu/argsort_grad_kernel.cc | 4 +- paddle/phi/kernels/xpu/argsort_kernel.cc | 4 +- .../phi/kernels/xpu/batch_norm_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/batch_norm_kernel.cc | 2 +- paddle/phi/kernels/xpu/c_split_kernel.cc | 4 +- paddle/phi/kernels/xpu/compare_kernel.cc | 4 +- paddle/phi/kernels/xpu/contiguous_kernel.cc | 67 +++-- paddle/phi/kernels/xpu/conv_grad_kernel.cc | 16 +- paddle/phi/kernels/xpu/conv_kernel.cc | 16 +- .../kernels/xpu/conv_transpose_grad_kernel.cc | 2 +- .../phi/kernels/xpu/conv_transpose_kernel.cc | 2 +- .../phi/kernels/xpu/cross_entropy_kernel.cc | 2 +- paddle/phi/kernels/xpu/cum_kernel.cc | 2 +- paddle/phi/kernels/xpu/cumprod_kernel.cc | 2 +- .../xpu/deformable_conv_grad_kernel.cc | 2 +- .../phi/kernels/xpu/deformable_conv_kernel.cc | 2 +- paddle/phi/kernels/xpu/diag_kernel.cc | 4 +- paddle/phi/kernels/xpu/diagonal_kernel.cc | 4 +- .../xpu/elementwise_add_grad_kernel.cc | 4 +- .../phi/kernels/xpu/elementwise_add_kernel.cc | 4 +- paddle/phi/kernels/xpu/expand_as_kernel.cc | 8 +- paddle/phi/kernels/xpu/expand_grad_kernel.cc | 4 +- paddle/phi/kernels/xpu/expand_kernel.cc | 6 +- .../xpu/fill_diagonal_tensor_kernel.cc | 4 +- paddle/phi/kernels/xpu/flip_kernel.cc | 2 +- paddle/phi/kernels/xpu/full_kernel.cc | 4 +- .../phi/kernels/xpu/gather_nd_grad_kernel.cc | 8 +- paddle/phi/kernels/xpu/gather_nd_kernel.cc | 8 +- paddle/phi/kernels/xpu/gaussian_kernel.cc | 2 +- .../kernels/xpu/generate_proposals_kernel.cc | 49 +-- paddle/phi/kernels/xpu/grid_sample_kernel.cc | 8 +- .../phi/kernels/xpu/group_norm_grad_kernel.cc | 6 +- paddle/phi/kernels/xpu/group_norm_kernel.cc | 6 +- paddle/phi/kernels/xpu/index_put_kernel.cc | 21 +- .../kernels/xpu/index_sample_grad_kernel.cc | 6 +- .../kernels/xpu/index_select_grad_kernel.cc | 4 +- paddle/phi/kernels/xpu/index_select_kernel.cc | 2 +- .../kernels/xpu/interpolate_grad_kernel.cc | 4 +- paddle/phi/kernels/xpu/interpolate_kernel.cc | 4 +- .../phi/kernels/xpu/layer_norm_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/layer_norm_kernel.cc | 2 +- paddle/phi/kernels/xpu/linspace_kernel.cc | 2 +- .../kernels/xpu/log_softmax_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/log_softmax_kernel.cc | 2 +- paddle/phi/kernels/xpu/logical_kernel.cc | 2 +- .../kernels/xpu/masked_select_grad_kernel.cc | 4 +- .../phi/kernels/xpu/masked_select_kernel.cc | 4 +- paddle/phi/kernels/xpu/matmul_grad_kernel.cc | 4 +- paddle/phi/kernels/xpu/meshgrid_kernel.cc | 2 +- .../phi/kernels/xpu/multiclass_nms3_kernel.cc | 2 +- .../phi/kernels/xpu/nll_loss_grad_kernel.cc | 2 +- 
paddle/phi/kernels/xpu/nll_loss_kernel.cc | 2 +- paddle/phi/kernels/xpu/nonzero_kernel.cc | 4 +- paddle/phi/kernels/xpu/p_norm_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/p_norm_kernel.cc | 2 +- paddle/phi/kernels/xpu/pad3d_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/pad_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/pad_kernel.cc | 2 +- .../pow2_decay_with_linear_warmup_kernel.cc | 2 +- paddle/phi/kernels/xpu/randint_kernel.cc | 2 +- paddle/phi/kernels/xpu/randperm_kernel.cc | 2 +- .../kernels/xpu/reduce_mean_grad_kernel.cc | 4 +- paddle/phi/kernels/xpu/scatter_kernel.cc | 4 +- .../kernels/xpu/scatter_nd_add_grad_kernel.cc | 8 +- .../phi/kernels/xpu/scatter_nd_add_kernel.cc | 6 +- .../phi/kernels/xpu/set_value_grad_kernel.cc | 14 +- paddle/phi/kernels/xpu/set_value_kernel.cc | 10 +- paddle/phi/kernels/xpu/split_kernel.cc | 2 +- paddle/phi/kernels/xpu/stack_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/stride_slice_kernel.cc | 2 +- paddle/phi/kernels/xpu/strided_copy_kernel.cc | 105 +++---- .../phi/kernels/xpu/take_along_axis_kernel.cc | 2 +- .../kernels/xpu/temporal_shift_grad_kernel.cc | 8 +- .../phi/kernels/xpu/temporal_shift_kernel.cc | 8 +- paddle/phi/kernels/xpu/tile_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/tile_kernel.cc | 6 +- paddle/phi/kernels/xpu/top_k_kernel.cc | 6 +- .../phi/kernels/xpu/transpose_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/transpose_kernel.cc | 2 +- .../phi/kernels/xpu/tril_triu_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/tril_triu_kernel.cc | 2 +- paddle/phi/kernels/xpu/unbind_kernel.cc | 2 +- paddle/phi/kernels/xpu/unfold_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/unfold_kernel.cc | 2 +- paddle/phi/kernels/xpu/uniform_kernel.cc | 2 +- paddle/phi/kernels/xpu/unique_kernel.cc | 16 +- paddle/phi/kernels/xpu/unstack_kernel.cc | 2 +- paddle/phi/kernels/xpu/warpctc_kernel.cc | 4 +- paddle/phi/kernels/xpu/where_grad_kernel.cc | 4 +- paddle/phi/kernels/xpu/where_kernel.cc | 4 +- paddle/phi/kernels/xpu/xpu_api_wrapper.h | 4 +- paddle/phi/tools/CMakeLists.txt | 2 +- paddle/pir/core/block.cc | 2 +- paddle/pir/core/block_argument.cc | 2 +- paddle/pir/core/block_operand.cc | 2 +- paddle/pir/core/builtin_attribute_storage.h | 2 +- paddle/pir/core/builtin_op.cc | 2 +- paddle/pir/core/builtin_type.cc | 4 +- paddle/pir/core/builtin_type.h | 1 - paddle/pir/core/builtin_type_interfaces.cc | 2 +- paddle/pir/core/builtin_type_interfaces.h | 6 +- paddle/pir/core/builtin_type_storage.h | 17 +- paddle/pir/core/dialect.h | 2 +- paddle/pir/core/enforce.h | 82 ----- paddle/pir/core/interface_support.h | 2 +- paddle/pir/core/iterator.h | 2 +- paddle/pir/core/macros.h | 31 -- paddle/pir/core/op_base.h | 2 +- paddle/pir/core/op_operand.cc | 2 +- paddle/pir/core/op_result.cc | 2 +- paddle/pir/core/op_trait.cc | 2 +- paddle/pir/core/operation.cc | 2 +- paddle/pir/core/operation.h | 4 +- paddle/pir/core/region.cc | 2 +- paddle/pir/core/storage_manager.cc | 2 +- paddle/pir/core/type_util.h | 4 +- paddle/pir/core/value.cc | 2 +- paddle/pir/dialect/shape/ir/shape_op.cc | 2 +- paddle/pir/pass/pass.h | 2 +- paddle/pir/pass/pass_registry.h | 3 +- paddle/pir/pattern_rewrite/pattern_match.cc | 2 +- paddle/pir/pattern_rewrite/pattern_match.h | 2 +- paddle/testing/CMakeLists.txt | 1 + paddle/utils/CMakeLists.txt | 6 +- paddle/utils/string/CMakeLists.txt | 6 +- patches/eigen/TensorReductionGpu.h | 2 +- python/setup.py.in | 3 +- python/setup_cinn.py.in | 2 + setup.py | 15 +- test/CMakeLists.txt | 6 +- test/cpp/auto_parallel/CMakeLists.txt | 8 +- test/cpp/auto_parallel/dist_tensor_test.cc | 2 +- 
test/cpp/auto_parallel/spmd_rule_test.cc | 145 ++++----- test/cpp/eager/CMakeLists.txt | 1 + .../accumulation_node_test.cc | 22 +- .../autograd_meta_test.cc | 2 +- .../data_structure_tests/eager_tensor_test.cc | 19 +- .../grad_node_info_test.cc | 6 +- .../data_structure_tests/grad_node_test.h | 2 +- .../grad_tensor_holder_test.cc | 10 +- .../tensor_wrapper_test.cc | 4 +- .../eager/performance_tests/CMakeLists.txt | 8 +- .../performance_tests/benchmark_eager_cpu.cc | 16 +- .../performance_tests/benchmark_eager_cuda.cc | 16 +- .../performance_tests/benchmark_fluid_cpu.cc | 12 +- .../performance_tests/benchmark_fluid_cuda.cc | 12 +- test/cpp/eager/task_tests/CMakeLists.txt | 2 +- test/cpp/eager/task_tests/backward_test.cc | 8 +- .../cross_batch_accumulation_test.cc | 2 +- test/cpp/eager/task_tests/eager_utils_test.cc | 6 +- .../eager/task_tests/forward_autograd_test.cc | 6 +- .../eager/task_tests/fwd_bwd_joint_test.cc | 17 +- test/cpp/eager/task_tests/generated_test.cc | 10 +- test/cpp/eager/task_tests/grad_test.cc | 8 +- test/cpp/eager/task_tests/hook_test.cc | 7 +- .../task_tests/hook_test_intermidiate.cc | 17 +- .../cpp/eager/task_tests/tensor_utils_test.cc | 2 +- test/cpp/eager/test_utils.h | 7 +- test/cpp/fluid/CMakeLists.txt | 26 +- test/cpp/fluid/assign_op_test.cc | 10 +- test/cpp/fluid/benchmark/CMakeLists.txt | 3 +- test/cpp/fluid/benchmark/op_tester.cc | 6 +- test/cpp/fluid/benchmark/op_tester.h | 2 +- test/cpp/fluid/cinn/CMakeLists.txt | 1 + .../fluid/cinn/cinn_launch_context_test.cc | 12 +- test/cpp/fluid/cinn/cinn_launch_op_test.cc | 2 +- test/cpp/fluid/cinn/test_helper.h | 2 +- .../controlflow/conditional_block_op_test.cc | 6 +- test/cpp/fluid/dlnne/dlnne_engine_op_test.cc | 2 +- test/cpp/fluid/elementwise/CMakeLists.txt | 24 +- .../test_elementwise_add_grad_grad.cc | 4 +- .../test_elementwise_add_op_inplace.cc | 2 +- .../test_elementwise_div_grad_grad.cc | 2 +- .../test_elementwise_op_grad_grad.h | 4 +- test/cpp/fluid/feed_forward_test.cu | 8 +- test/cpp/fluid/framework/CMakeLists.txt | 15 +- .../fluid/framework/copy_same_tensor_test.cc | 6 +- .../framework/data_layout_transform_test.cc | 10 +- .../framework/data_type_transform_test.cc | 41 +-- .../framework/data_type_transform_test.cu | 16 +- test/cpp/fluid/framework/eigen_test.cc | 15 +- test/cpp/fluid/framework/operator_test.cc | 2 +- .../paddle2cinn/cinn_cache_key_test.cc | 6 +- .../paddle2cinn/cinn_compiler_test.cc | 4 +- test/cpp/fluid/framework/reader_test.cc | 2 +- .../framework/selected_rows_utils_test.cc | 19 +- test/cpp/fluid/framework/tensor_test.cc | 42 +-- test/cpp/fluid/framework/tensor_util_test.cc | 22 +- test/cpp/fluid/fused/CMakeLists.txt | 5 + .../cpp/fluid/fused/cudnn_bn_add_relu_test.cc | 30 +- test/cpp/fluid/fused/cudnn_norm_conv_test.cc | 20 +- test/cpp/fluid/fused/fusion_group_op_test.cc | 5 +- test/cpp/fluid/gather_test.cc | 6 +- test/cpp/fluid/lite/CMakeLists.txt | 2 +- test/cpp/fluid/math/CMakeLists.txt | 10 +- test/cpp/fluid/math/beam_search_test.cc | 6 +- test/cpp/fluid/math/concat_test.cc | 24 +- .../fluid/math/selected_rows_functor_test.cc | 48 +-- .../math/selected_rows_functor_test.cu.cc | 29 +- test/cpp/fluid/mkldnn/CMakeLists.txt | 6 + test/cpp/fluid/mkldnn/test_mkldnn_caching.cc | 2 +- .../fluid/mkldnn/test_mkldnn_op_inplace.cc | 2 +- test/cpp/fluid/mkldnn/test_mkldnn_op_nhwc.cc | 12 +- test/cpp/fluid/nccl/nccl_op_test.cu.cc | 10 +- test/cpp/fluid/pscore/CMakeLists.txt | 17 +- test/cpp/fluid/scatter_test.cc | 6 +- .../test_common_infer_shape_functions.cc | 2 +- 
test/cpp/imperative/CMakeLists.txt | 2 +- test/cpp/imperative/heter_ccl_context_test.cc | 4 +- test/cpp/imperative/nccl_context_test.cc | 2 +- .../imperative/test_gradient_accmulator.cc | 10 +- test/cpp/imperative/test_group.cc | 2 +- test/cpp/imperative/test_hooks.cc | 10 +- test/cpp/imperative/test_prepare_op.cc | 4 +- test/cpp/imperative/test_tracer.cc | 28 +- test/cpp/inference/analysis/CMakeLists.txt | 1 + test/cpp/inference/api/CMakeLists.txt | 125 ++++++-- test/cpp/inference/api/api_impl_tester.cc | 6 +- .../inference/api/mkldnn_quantizer_tester.cc | 26 +- test/cpp/inference/api/tester_helper.h | 6 +- test/cpp/inference/test_helper.h | 2 +- test/cpp/jit/CMakeLists.txt | 10 +- test/cpp/jit/layer_test.cc | 2 +- test/cpp/new_executor/CMakeLists.txt | 5 +- .../new_executor/standalone_executor_test.cc | 2 +- test/cpp/phi/api/CMakeLists.txt | 10 +- test/cpp/phi/api/test_phi_exception.cc | 2 +- test/cpp/phi/api/test_phi_tensor.cc | 2 +- test/cpp/phi/api/test_strings_empty_api.cc | 2 +- test/cpp/phi/api/test_to_api.cc | 5 +- test/cpp/phi/common/CMakeLists.txt | 14 +- test/cpp/phi/common/test_backend.cc | 2 +- test/cpp/phi/common/test_data_layout.cc | 4 +- test/cpp/phi/common/test_data_type.cc | 2 +- test/cpp/phi/common/test_scalar.cu | 31 +- test/cpp/phi/core/CMakeLists.txt | 30 +- test/cpp/phi/core/test_custom_kernel.cc | 10 +- test/cpp/phi/core/test_ddim.cc | 44 +-- test/cpp/phi/core/test_dim.cu | 22 +- test/cpp/phi/core/test_meta_fn_utils.cc | 2 +- test/cpp/phi/core/test_selected_rows.cc | 19 +- test/cpp/phi/core/test_sparse_coo_tensor.cc | 15 +- test/cpp/phi/core/test_sparse_csr_tensor.cc | 17 +- test/cpp/phi/core/test_tensor_array.cc | 2 +- test/cpp/phi/core/unroll_array_ops_test.cc | 18 +- test/cpp/phi/kernels/CMakeLists.txt | 34 +-- test/cpp/phi/kernels/sequence_padding_test.cc | 11 +- test/cpp/phi/kernels/sequence_pooling_test.cc | 4 +- test/cpp/phi/kernels/test_auto_tune.cu | 20 +- .../cpp/phi/kernels/test_fused_adam_kernel.cc | 8 +- test/cpp/phi/kernels/test_memcpy_dev_api.cc | 2 +- .../cpp/phi/kernels/test_ternary_broadcast.cu | 24 +- .../kernels/test_transfer_layout_dev_api.cc | 4 +- test/cpp/phi/ops/CMakeLists.txt | 2 +- test/cpp/pir/cinn/CMakeLists.txt | 3 +- test/cpp/pir/cinn/group_op_test.cc | 10 +- test/cpp/pir/core/CMakeLists.txt | 9 +- test/cpp/pir/core/ir_op_test.cc | 2 +- test/cpp/pir/core/ir_program_test.cc | 2 +- test/cpp/pir/core/type_interface_test.cc | 2 +- test/cpp/pir/kernel_dialect/CMakeLists.txt | 8 +- test/cpp/pir/pass/CMakeLists.txt | 1 + .../pattern_rewrite/pattern_rewrite_test.cc | 4 +- test/cpp/pir/shape_dialect/CMakeLists.txt | 20 +- .../pir/shape_dialect/constraint_pass_test.cc | 2 +- test/cpp/pir/tools/test_interface.h | 2 +- test/cpp/pir/tools/test_op.cc | 2 +- test/cpp/pir/tools/test_trait.cc | 2 +- test/cpp/prim/CMakeLists.txt | 6 +- test/cpp/prim/test_eager_prim.cc | 6 +- 1819 files changed, 8289 insertions(+), 8899 deletions(-) mode change 100755 => 100644 paddle/cinn/frontend/interpreter_test.cc mode change 100755 => 100644 paddle/cinn/hlir/framework/memory.h rename paddle/{phi/core => common}/errors.cc (93%) rename paddle/{phi => }/common/layout.h (93%) mode change 100755 => 100644 paddle/fluid/distributed/ps/service/brpc_ps_client.h delete mode 100644 paddle/phi/api/ext/exception.h delete mode 100644 paddle/phi/core/ddim.cc delete mode 100644 paddle/phi/core/ddim.h delete mode 100644 paddle/phi/core/errors.h delete mode 100644 paddle/phi/core/macros.h delete mode 100644 paddle/phi/core/utils/array.h delete mode 100644 paddle/phi/core/utils/dim.h 
delete mode 100644 paddle/phi/core/utils/unroll_array_ops.h delete mode 100644 paddle/pir/core/enforce.h delete mode 100644 paddle/pir/core/macros.h diff --git a/cmake/generic.cmake b/cmake/generic.cmake index baa0340eeb992a..ab09d597499772 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -622,7 +622,7 @@ function(paddle_test_build TARGET_NAME) if(APPLE) target_link_libraries( ${TARGET_NAME} - "-Wl,-rpath,$ -Wl,-rpath,$ -Wl,-rpath,$" + "-Wl,-rpath,$ -Wl,-rpath,$ -Wl,-rpath,$ -Wl,-rpath,$" ) endif() common_link(${TARGET_NAME}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 73668097014eb4..06dc5d6173794a 100755 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -286,6 +286,10 @@ copy( include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) # copy api headers for phi & custom op +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/common/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/common/) copy( inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/ext/*.h @@ -304,8 +308,17 @@ copy( DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/phi/common/) copy( inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/macros.h + SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/enforce.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/phi/core/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/string/*.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/string/) +copy( + inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/string/tinyformat/tinyformat.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/string/tinyformat/ +) copy( inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/visit_type.h @@ -320,40 +333,13 @@ copy( DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/phi/) copy( inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/) -copy( - inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/) -copy( - inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/) -copy( - inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/flat_hash_map.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/) -copy( - inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/flags.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/) -copy( - inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/test_macros.h + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/*.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/) copy( inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/) -if(NOT WITH_GFLAGS) - copy( - inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/flags_native.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/) -endif() - # the include path of phi needs to be changed to adapt to inference api path add_custom_command( TARGET inference_lib_dist diff --git a/paddle/cinn/api/tensor_node.h b/paddle/cinn/api/tensor_node.h index fca0a844108bc8..2a9a836c7d1e93 100644 --- a/paddle/cinn/api/tensor_node.h +++ b/paddle/cinn/api/tensor_node.h @@ -52,9 +52,10 @@ class TensorNode final { class ConsumerOpListView { public: - 
ConsumerOpListView(const std::set, - common::GraphEdgeCompare>& edges, - const hlir::framework::Graph* graph) + ConsumerOpListView( + const std::set, + cinn::common::GraphEdgeCompare>& edges, + const hlir::framework::Graph* graph) : edges_(edges), graph_(graph) {} ConsumerOpListView(const ConsumerOpListView& other) = delete; @@ -64,8 +65,8 @@ class TensorNode final { class Iterator { public: - Iterator(std::set, - common::GraphEdgeCompare>::const_iterator it, + Iterator(std::set, + cinn::common::GraphEdgeCompare>::const_iterator it, const hlir::framework::Graph* graph) : iter_(it), graph_(graph) {} @@ -89,8 +90,8 @@ class TensorNode final { OpNode operator*() const; private: - std::set, - common::GraphEdgeCompare>::const_iterator iter_; + std::set, + cinn::common::GraphEdgeCompare>::const_iterator iter_; const hlir::framework::Graph* graph_; }; @@ -101,7 +102,8 @@ class TensorNode final { Iterator end() const { return Iterator(this->edges_.end(), graph_); } private: - const std::set, common::GraphEdgeCompare>& edges_; + const std::set, + cinn::common::GraphEdgeCompare>& edges_; const hlir::framework::Graph* graph_; }; diff --git a/paddle/cinn/ast_gen_ius/ast_gen.cc b/paddle/cinn/ast_gen_ius/ast_gen.cc index c8be20ae3afa61..bef88ade1a7fb1 100644 --- a/paddle/cinn/ast_gen_ius/ast_gen.cc +++ b/paddle/cinn/ast_gen_ius/ast_gen.cc @@ -90,7 +90,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { std::vector iter_values; // reduce body and reduce init schedule block should have different objects // for same axis so we re-create objects - std::vector axis_vars = common::GenDefaultAxis(axis_len); + std::vector axis_vars = cinn::common::GenDefaultAxis(axis_len); for (int i = 0; i < shape.size(); ++i) { block_vars.push_back(Var(Expr(0), shape[i], @@ -118,7 +118,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { std::vector reduce_iter_values; // reduce body and reduce init schedule block should have different objects // for same axis so we re-create objects - std::vector reduce_axis_vars = common::GenDefaultAxis(axis_len); + std::vector reduce_axis_vars = cinn::common::GenDefaultAxis(axis_len); for (int i = 0; i < shape.size(); ++i) { reduce_block_vars.push_back(Var(Expr(0), shape[i], @@ -182,7 +182,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { // create schedule block itervars, i0,i1... 
std::vector block_vars; std::vector iter_values; - std::vector axis_vars = common::GenDefaultAxis(axis_len); + std::vector axis_vars = cinn::common::GenDefaultAxis(axis_len); for (int i = 0; i < shape.size(); ++i) { block_vars.push_back(Var( Expr(0), shape[i], cinn::UniqName("i" + std::to_string(i)), false)); diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc index fbfdc7af72e9a6..6f00ee34813d15 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc @@ -144,7 +144,7 @@ bool NeedsMultiLevelTiling(const ir::ScheduleBlockRealize& sche_block_realize) { return total_unused_iter_vars >= 1; } -ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, +ir::LoweredFunc UpdateFuncWithNewBody(const cinn::common::Target& target, const ir::LoweredFunc& old_func, ir::Expr& body) { // NOLINT ir::ModuleExpr mod_expr(std::vector({body})); @@ -179,7 +179,7 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, ir::LoweredFunc new_func = ir::_LoweredFunc_::Make( old_func->name, old_func->args, updated_body, new_temp_bufs); #ifdef CINN_WITH_CUDA - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { new_func->PrepareCudaAxisInfoFromBody(); } #endif diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.h b/paddle/cinn/auto_schedule/analysis/analyze_ir.h index 81d00dcb22ec3a..2afe33ea2706fe 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir.h +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.h @@ -44,7 +44,7 @@ bool NeedsMultiLevelTiling(const ir::ScheduleBlockRealize& sche_block_realize); /** * Update a LoweredFunc by regenerating related fields with a new function body */ -ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, +ir::LoweredFunc UpdateFuncWithNewBody(const cinn::common::Target& target, const ir::LoweredFunc& old_func, ir::Expr& body); // NOLINT diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc index f7fffa0e0ff4b2..970a41173087a5 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc @@ -38,9 +38,9 @@ namespace auto_schedule { TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_SimpleAssign) { Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif ir::Expr M(32); @@ -102,9 +102,9 @@ TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_SimpleAssign) { TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_AddDiffShape) { Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif ir::Expr M(32); @@ -158,9 +158,9 @@ TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_AddDiffShape) { TEST(AnalyzeIr, ContainsNodeType) { Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif ir::Expr M(32); diff --git 
a/paddle/cinn/auto_schedule/auto_tuner.cc b/paddle/cinn/auto_schedule/auto_tuner.cc index d8280af500089e..d45dcc743e525c 100644 --- a/paddle/cinn/auto_schedule/auto_tuner.cc +++ b/paddle/cinn/auto_schedule/auto_tuner.cc @@ -38,7 +38,7 @@ namespace cinn { namespace auto_schedule { -AutoTuner::AutoTuner(const common::Target& target, +AutoTuner::AutoTuner(const cinn::common::Target& target, hlir::framework::Graph* graph) : target_(target), graph_(graph) {} @@ -58,7 +58,7 @@ void AutoTuner::Initialize(const Config& config, tasks_ = task_creator.CreateTuneTaskOpLevel(graph_); const auto& dtype_dict = - graph_->GetAttrs>( + graph_->GetAttrs>( "inferdtype"); const auto& shape_dict = graph_->GetAttrs< absl::flat_hash_map>("infershape"); diff --git a/paddle/cinn/auto_schedule/auto_tuner.h b/paddle/cinn/auto_schedule/auto_tuner.h index 9875e5dfcdd000..e4c416c9009478 100644 --- a/paddle/cinn/auto_schedule/auto_tuner.h +++ b/paddle/cinn/auto_schedule/auto_tuner.h @@ -46,7 +46,7 @@ class AutoTuner { DatabaseConfig database_config; }; - AutoTuner(const common::Target& target, hlir::framework::Graph* graph); + AutoTuner(const cinn::common::Target& target, hlir::framework::Graph* graph); // Initialize tuner with specific config and auxiliary objects. void Initialize(const Config& config, @@ -56,7 +56,7 @@ class AutoTuner { TuningResult Tune(const TuningOptions& options); private: - const common::Target& target_; + const cinn::common::Target& target_; hlir::framework::Graph* graph_; std::unique_ptr> op_lowerer_; diff --git a/paddle/cinn/auto_schedule/auto_tuner_test.cc b/paddle/cinn/auto_schedule/auto_tuner_test.cc index 36fd51016c989e..6ddaa2b2d7669d 100644 --- a/paddle/cinn/auto_schedule/auto_tuner_test.cc +++ b/paddle/cinn/auto_schedule/auto_tuner_test.cc @@ -48,9 +48,9 @@ using ::cinn::hlir::framework::Scope; class TestAutoTuner : public ::testing::Test { public: #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::shared_ptr graph; diff --git a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc index fcaf57d54c4cef..a9074c76fa8cf9 100644 --- a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc +++ b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.cc @@ -29,7 +29,7 @@ namespace cinn { namespace auto_schedule { float ExprCostModel::Predict(const ir::ModuleExpr& sample, - const common::Target& target) const { + const cinn::common::Target& target) const { if (trained_times_.load() == 0) { return SearchState::NOT_INIT_COST; } @@ -42,7 +42,7 @@ float ExprCostModel::Predict(const ir::ModuleExpr& sample, void ExprCostModel::Train(const std::vector& samples, const std::vector& labels, - const common::Target& target) { + const cinn::common::Target& target) { trained_times_.store(1); size_t total_size = samples.size(); CHECK_EQ(total_size, labels.size()) @@ -60,7 +60,7 @@ void ExprCostModel::Train(const std::vector& samples, void ExprCostModel::Update(const std::vector& samples, const std::vector& labels, - const common::Target& target) { + const cinn::common::Target& target) { ++trained_times_; size_t total_size = samples.size(); CHECK_EQ(total_size, labels.size()) diff --git a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.h b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.h index 02e0b4a52c831f..4dc34045709374 100644 --- 
a/paddle/cinn/auto_schedule/cost_model/expr_cost_model.h +++ b/paddle/cinn/auto_schedule/cost_model/expr_cost_model.h @@ -30,13 +30,13 @@ namespace auto_schedule { class ExprCostModel : public XgbCostModel { public: virtual float Predict(const ir::ModuleExpr& sample, - const common::Target& target) const; + const cinn::common::Target& target) const; void Train(const std::vector& samples, const std::vector& labels, - const common::Target& target); + const cinn::common::Target& target); void Update(const std::vector& samples, const std::vector& labels, - const common::Target& target); + const cinn::common::Target& target); private: std::atomic trained_times_{0}; diff --git a/paddle/cinn/auto_schedule/cost_model/feature.cc b/paddle/cinn/auto_schedule/cost_model/feature.cc index f993ee256616a6..3a403b21d081f7 100644 --- a/paddle/cinn/auto_schedule/cost_model/feature.cc +++ b/paddle/cinn/auto_schedule/cost_model/feature.cc @@ -37,12 +37,12 @@ namespace cinn { namespace auto_schedule { Feature::Feature() - : target_(common::UnkTarget()), + : target_(cinn::common::UnkTarget()), stack_encoded_feature_(1), // initialize a LoopBlockFeature as root block current_loop_block_index_(0), parent_indices_(1, -1) {} -Feature::Feature(const common::Target& target) +Feature::Feature(const cinn::common::Target& target) : target_(target), stack_encoded_feature_(1), // initialize a LoopBlockFeature as root block current_loop_block_index_(0), @@ -52,7 +52,7 @@ std::vector Feature::ToFixedSizeVector() { std::vector ret(LoopBlockFeature::kTotalSize + 1, 0); // LoopBlockFeature::kTotalSize plus 1 for target - if (target_ == common::DefaultNVGPUTarget()) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { ret[0] = 1; } // else 0 for other cases diff --git a/paddle/cinn/auto_schedule/cost_model/feature.h b/paddle/cinn/auto_schedule/cost_model/feature.h index cfd100598cdd17..2f98b12c269a6b 100644 --- a/paddle/cinn/auto_schedule/cost_model/feature.h +++ b/paddle/cinn/auto_schedule/cost_model/feature.h @@ -134,7 +134,7 @@ class Feature { public: Feature(); - explicit Feature(const common::Target& target); + explicit Feature(const cinn::common::Target& target); // Convert the various-length loop block features to fixed-size vector std::vector ToFixedSizeVector(); @@ -182,7 +182,7 @@ class Feature { int current_loop_block_index_; std::vector parent_indices_; - common::Target target_; + cinn::common::Target target_; }; } // namespace auto_schedule diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc index a8255c1875c746..3189e2e1c2b4eb 100644 --- a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc +++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc @@ -50,7 +50,7 @@ void FeatureExtractor::Visit(const Expr *x) { } Feature FeatureExtractor::Extract(const ir::ModuleExpr &mod_expr, - const common::Target &target) { + const cinn::common::Target &target) { feature_ = Feature(target); for (const ir::Expr &e : mod_expr.GetExprs()) { Visit(&e); @@ -91,8 +91,9 @@ NotVisitExprFields(_Tensor_) #define VisitForDtypePattern(NodeType, member) \ void FeatureExtractor::Visit(const NodeType *x) { \ - if (x->type() == common::F32() || x->type() == common::F16() || \ - x->type() == common::F64()) { \ + if (x->type() == cinn::common::F32() || \ + x->type() == cinn::common::F16() || \ + x->type() == cinn::common::F64()) { \ feature_.CurrentLoopBlock().float_##member += x->type().lanes(); \ } else { \ feature_.CurrentLoopBlock().int_##member += 
x->type().lanes(); \ @@ -125,8 +126,9 @@ VisitForDtypePattern(Let, other_call); #define VisitForMultiOperandsDtypePattern(NodeType, member) \ void FeatureExtractor::Visit(const NodeType *x) { \ - if (x->type() == common::F32() || x->type() == common::F16() || \ - x->type() == common::F64()) { \ + if (x->type() == cinn::common::F32() || \ + x->type() == cinn::common::F16() || \ + x->type() == cinn::common::F64()) { \ feature_.CurrentLoopBlock().float_##member += \ (x->operands().size() - 1); \ } else { \ @@ -231,8 +233,8 @@ void FeatureExtractor::Visit(const PolyFor *x) { /* Visit for Reduce and Broadcast */ void FeatureExtractor::Visit(const Reduce *x) { - if (x->type() == common::F32() || x->type() == common::F16() || - x->type() == common::F64()) { + if (x->type() == cinn::common::F32() || x->type() == cinn::common::F16() || + x->type() == cinn::common::F64()) { switch (x->reduce_type) { case Reduce::ReduceType::kSum: feature_.CurrentLoopBlock().float_reduce_sum_or_sub += diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor.h b/paddle/cinn/auto_schedule/cost_model/feature_extractor.h index 690d669da720b9..61b2a6083b7ba4 100644 --- a/paddle/cinn/auto_schedule/cost_model/feature_extractor.h +++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor.h @@ -40,7 +40,8 @@ namespace auto_schedule { class FeatureExtractor : public ir::IRVisitorRequireReImpl { public: FeatureExtractor(); - Feature Extract(const ir::ModuleExpr& mod_expr, const common::Target& target); + Feature Extract(const ir::ModuleExpr& mod_expr, + const cinn::common::Target& target); void Visit(const Expr* x) override; diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc b/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc index 10726f450a0de3..22fa1a7f259bad 100644 --- a/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc +++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc @@ -38,9 +38,9 @@ namespace auto_schedule { TEST(FeatureExtractor, SimpleAssign) { Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif ir::Expr M(32); ir::Expr N(32); @@ -93,9 +93,9 @@ TEST(FeatureExtractor, SimpleAssign) { TEST(FeatureExtractor, MatrixMultiply) { Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif ir::Expr M(2); diff --git a/paddle/cinn/auto_schedule/cost_model/xgb_cost_model.cc b/paddle/cinn/auto_schedule/cost_model/xgb_cost_model.cc index 8697aaa42ee1c0..8cac30ee841391 100644 --- a/paddle/cinn/auto_schedule/cost_model/xgb_cost_model.cc +++ b/paddle/cinn/auto_schedule/cost_model/xgb_cost_model.cc @@ -96,7 +96,7 @@ void AddDistPkgToPythonSysPath() { } XgbCostModel::XgbCostModel() { - common::PythonInterpreterGuard::Guard(); + cinn::common::PythonInterpreterGuard::Guard(); int previous = xgb_cost_model_count_.fetch_add(1); if (previous == 0) { AddDistPkgToPythonSysPath(); diff --git a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc index 5db6f8999b18a5..0a4a98f977dc11 100644 --- a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc +++ 
b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc @@ -92,7 +92,7 @@ class TestJSONFileDatabase : public ::testing::Test { std::string record_file_path; JSONFileDatabase test_db; std::vector lowered_funcs; - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); }; TEST_F(TestJSONFileDatabase, Serialize) { diff --git a/paddle/cinn/auto_schedule/measure/measurer_test.cc b/paddle/cinn/auto_schedule/measure/measurer_test.cc index 89a2feece5aeaf..26600567c5abbf 100644 --- a/paddle/cinn/auto_schedule/measure/measurer_test.cc +++ b/paddle/cinn/auto_schedule/measure/measurer_test.cc @@ -57,9 +57,9 @@ class TestMeasurer : public ::testing::Test { void SetUp() override { #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto program = CreateAddReluProgram(); @@ -70,7 +70,7 @@ class TestMeasurer : public ::testing::Test { TaskCreator task_creator; tasks = task_creator.CreateTuneTaskOpLevel(graph.get()); const auto& dtype_dict = - graph->GetAttrs>( + graph->GetAttrs>( "inferdtype"); const auto& shape_dict = graph->GetAttrs< absl::flat_hash_map>( diff --git a/paddle/cinn/auto_schedule/measure/simple_runner.cc b/paddle/cinn/auto_schedule/measure/simple_runner.cc index 1871cfc82ae92f..92dcc00693b5b9 100644 --- a/paddle/cinn/auto_schedule/measure/simple_runner.cc +++ b/paddle/cinn/auto_schedule/measure/simple_runner.cc @@ -45,31 +45,31 @@ static const std::unordered_map> }; // Generate random value and populate them to the output address of memory -static void PopulateRandomValue(const common::Type& type, +static void PopulateRandomValue(const cinn::common::Type& type, const int numel, void* raw_ptr) { std::random_device seed; std::default_random_engine engine(seed()); - if (type == common::Bool()) { + if (type == cinn::common::Bool()) { auto* fmt_ptr = reinterpret_cast(raw_ptr); std::bernoulli_distribution dist(0.5); std::generate_n( fmt_ptr, numel, [&engine, &dist]() { return dist(engine); }); - } else if (type == common::I32()) { + } else if (type == cinn::common::I32()) { auto* fmt_ptr = reinterpret_cast(raw_ptr); std::uniform_int_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); std::generate_n( fmt_ptr, numel, [&engine, &dist]() { return dist(engine); }); - } else if (type == common::I64()) { + } else if (type == cinn::common::I64()) { auto* fmt_ptr = reinterpret_cast(raw_ptr); std::uniform_int_distribution dist( std::numeric_limits::min(), std::numeric_limits::max()); std::generate_n( fmt_ptr, numel, [&engine, &dist]() { return dist(engine); }); - } else if (type == common::F32()) { + } else if (type == cinn::common::F32()) { auto* fmt_ptr = reinterpret_cast(raw_ptr); std::uniform_real_distribution dist( std::numeric_limits::min(), std::numeric_limits::max()); @@ -90,12 +90,12 @@ static void PopulateRandomValue(const common::Type& type, // Initialize a tensor with 0 if init_with_zero == true, otherwise initialize // the tensor with random value. 
static void InitTensorData(Tensor tensor, - const common::Target& target, + const cinn::common::Target& target, bool init_with_zero) { int mem_size = tensor->shape().numel() * tensor->type().bytes(); auto* tensor_data = tensor->mutable_data(target, tensor->type()); #ifdef CINN_WITH_CUDA - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { if (init_with_zero) { cudaMemset(tensor_data, 0, mem_size); } else { @@ -106,7 +106,7 @@ static void InitTensorData(Tensor tensor, } } #endif - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { if (init_with_zero) { memset(tensor_data, 0, mem_size); } else { @@ -228,7 +228,7 @@ MeasureResult SimpleRunner::Run(const MeasureInput& input, instr->Run(&execution_args); } #ifdef CINN_WITH_CUDA - if (instr->target_ == common::DefaultNVGPUTarget()) { + if (instr->target_ == cinn::common::DefaultNVGPUTarget()) { CUDA_CALL(cudaDeviceSynchronize()); } #endif diff --git a/paddle/cinn/auto_schedule/measure/simple_runner_test.cc b/paddle/cinn/auto_schedule/measure/simple_runner_test.cc index a0427edd56ced4..fc231b00e8e9dc 100644 --- a/paddle/cinn/auto_schedule/measure/simple_runner_test.cc +++ b/paddle/cinn/auto_schedule/measure/simple_runner_test.cc @@ -40,9 +40,9 @@ using ::cinn::hlir::framework::Scope; class TestSimpleRunner : public ::testing::Test { public: #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::shared_ptr<Graph> graph; std::shared_ptr<Scope> compiled_scope; @@ -69,9 +69,9 @@ class TestSimpleRunner : public ::testing::Test { task = std::make_unique<TuneTask>(); #ifdef CINN_WITH_CUDA - task->target = common::DefaultNVGPUTarget(); + task->target = cinn::common::DefaultNVGPUTarget(); #else - task->target = common::DefaultHostTarget(); + task->target = cinn::common::DefaultHostTarget(); #endif task->subgraph = graph->fusion_groups.front(); input.task = task.get(); @@ -118,7 +118,7 @@ TEST_F(TestSimpleRunner, TimeMeasured) { BuildResult build_result; build_result.compiled_scope = nullptr; std::vector<std::unique_ptr<Instruction>> instructions; - instructions.emplace_back(new Instruction(common::DefaultHostTarget(), + instructions.emplace_back(new Instruction(cinn::common::DefaultHostTarget(), nullptr, {}, {"empty_placeholder"}, diff --git a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc index 0507c78ff2e1cc..ad7e77e2d157ca 100644 --- a/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc +++ b/paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc @@ -43,7 +43,7 @@ TEST_F(TestCooperativeProcess, Matmul) { int num_threads_x = 2; int steps_k = 8; - Initialize(common::DefaultNVGPUTarget()); + Initialize(cinn::common::DefaultNVGPUTarget()); frontend::Program matmul_op = tests::OpBuilder("matmul").Build({{"X", X_shape}, {"Y", Y_shape}}); ir::IRSchedule ir_schedule = MakeIRSchedule(matmul_op, fixed_rand_seed); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h index c4baf8e7797e38..b5981e5aec9a95 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h @@ -27,7 +27,7 @@ namespace auto_schedule { // Auto bind GPU
index(BlockIdx, ThreadIdx) to the loops around the block class AutoBind : public AutoGenRule { public: - explicit AutoBind(const common::Target& target) : AutoGenRule(target) {} + explicit AutoBind(const cinn::common::Target& target) : AutoGenRule(target) {} ~AutoBind() = default; RuleApplyType Init(ir::IRSchedule* init_schedule) override; diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind_test.cc index 35dc5374b96647..72e11879733343 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind_test.cc @@ -38,7 +38,7 @@ class TestAutoBind : public TestAutoGenRuleBase { void TestApplyOnElementWiseAdd(const std::vector<int>& shape, const std::string& block_name) { - Initialize(common::DefaultNVGPUTarget()); + Initialize(cinn::common::DefaultNVGPUTarget()); auto test_program = tests::OpBuilder("elementwise_add").Build({{"X", shape}, {"Y", shape}}); // construct input parameter @@ -107,7 +107,7 @@ class TestAutoBind : public TestAutoGenRuleBase { }; TEST_F(TestAutoBind, AnalyseApplyType) { - Initialize(common::DefaultNVGPUTarget()); + Initialize(cinn::common::DefaultNVGPUTarget()); ir::IRSchedule ir_schedule = MakeIRSchedule( tests::OpBuilder("matmul").Build({{"X", {32, 64}}, {"Y", {64, 32}}})); SearchState state(ir_schedule, 0, {}); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc index bb215358d0b641..e52d91c1252241 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.cc @@ -24,7 +24,8 @@ namespace cinn { namespace auto_schedule { -AutoGenRule::AutoGenRule(const common::Target& target) : target_(&target) {} +AutoGenRule::AutoGenRule(const cinn::common::Target& target) + : target_(&target) {} int AutoGenRule::NumberApplicable() const { CHECK_GE(num_applicable_, 0) diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h index e3008b857c53a9..dee0b72f19f4ff 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h @@ -45,7 +45,7 @@ enum class RuleApplyType : int { */ class AutoGenRule { public: - explicit AutoGenRule(const common::Target& target); + explicit AutoGenRule(const cinn::common::Target& target); ~AutoGenRule() = default; // Initialize the AutoGenRule, it must be called before further actions. @@ -83,7 +83,7 @@ class AutoGenRule { // number of ScheduleBlock that can apply this auto gen rule int num_applicable_ = -1; // Target, not owned.
- const common::Target* target_; + const cinn::common::Target* target_; // IRSchedule, not owned; ir::IRSchedule* ir_schedule_; }; diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc index 57e13c00a1c76b..4ba7092cee323c 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc @@ -36,7 +36,7 @@ namespace cinn { namespace auto_schedule { AutoInline::AutoInline( - const common::Target& target, + const cinn::common::Target& target, const std::unordered_set<std::string>& no_inline_output_names) : AutoGenRule(target), no_inline_output_names_(no_inline_output_names) {} diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h index 9a0fc3e823361f..66a5818c7c4438 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h @@ -41,7 +41,7 @@ enum class AutoInlineType : int { class AutoInline : public AutoGenRule { public: - AutoInline(const common::Target& target, + AutoInline(const cinn::common::Target& target, const std::unordered_set<std::string>& no_inline_output_names); ~AutoInline() = default; diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc index e69d3069f19390..83310de86f8baf 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline_test.cc @@ -50,7 +50,7 @@ using ::cinn::hlir::framework::OpLowerer; TEST(AutoInline, SingleLoopInline) { srand(0); Context::Global().ResetNameId(); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Expr M(32); @@ -140,7 +140,7 @@ TEST(AutoInline, SingleLoopInline) { TEST(AutoInline, AddReluInline) { srand(0); Context::Global().ResetNameId(); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); frontend::NetBuilder builder("test"); @@ -155,7 +155,7 @@ TEST(AutoInline, AddReluInline) { hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); const auto& dtype_dict = - graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>( + graph->GetAttrs<absl::flat_hash_map<std::string, cinn::common::Type>>( "inferdtype"); const auto& shape_dict = graph->GetAttrs< absl::flat_hash_map<std::string, shape_t>>("infershape"); @@ -268,7 +268,7 @@ class TestAutoInline : public TestAutoGenRuleBase {}; * Add(Multiply(Add(Relu()))) */ TEST_F(TestAutoInline, SingleChain) { - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Initialize(target); std::vector<std::string> input_names = { "bias", "conv_output", "bn_scale", "bn_offset"}; @@ -343,7 +343,7 @@ TEST_F(TestAutoInline, SingleChain) { * z = Multiply(Exp()) */ TEST_F(TestAutoInline, InlineToMultiConsumers) { - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Initialize(target); std::vector<std::string> input_names = {"x"}; std::vector<std::string> output_names = {"var_2", "var_1", "var_0"}; @@ -404,7 +404,7 @@ TEST_F(TestAutoInline, InlineToMultiConsumers) { * z1 = Subtract(Gather(), Add(Gather())) */ TEST_F(TestAutoInline, OnlySpatialOp) { - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Initialize(target); std::vector<std::string> input_names = {"x", "y"}; std::vector<std::string> output_names = {"var_6", @@
-472,7 +472,7 @@ TEST_F(TestAutoInline, OnlySpatialOp) { * y = Add(fill_constant()) */ TEST_F(TestAutoInline, NoReadBufferOp) { - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Initialize(target); std::vector input_names = {"x"}; std::vector output_names = {"var_0", "fill_constant"}; diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h index f11ff7227a70ec..d5521e15c53482 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.h @@ -31,7 +31,8 @@ namespace auto_schedule { // based on actual situation. class AutoUnroll : public AutoGenRule { public: - explicit AutoUnroll(const common::Target& target) : AutoGenRule(target) {} + explicit AutoUnroll(const cinn::common::Target& target) + : AutoGenRule(target) {} ~AutoUnroll() = default; RuleApplyType Init(ir::IRSchedule* init_schedule) override; diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc index e4b0597cfeed75..0118846ab7a2e8 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll_test.cc @@ -35,9 +35,9 @@ TEST(AutoUnroll, Init) { {M, N}, [&](Var i, Var j) { return A(i, j) * B(i, j); }, "C"); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif ast_gen_ius::TensorGroup tensor_group({C}); auto funcs = @@ -65,9 +65,9 @@ TEST(AutoUnroll, UnrollableApply) { "C"); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif auto stages = CreateStages({C}); auto funcs = cinn::lang::LowerVec( diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/mix_rules_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/mix_rules_test.cc index d0e2dde7bdad67..caa130fb6bdbff 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/mix_rules_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/mix_rules_test.cc @@ -36,7 +36,7 @@ class TestMixRules : public TestAutoGenRuleBase { TEST_F(TestMixRules, 2DMatmulOnMultiTilingRelated) { frontend::Program matmul_op = tests::OpBuilder("matmul").Build({{"X", {32, 32}}, {"Y", {32, 32}}}); - Initialize(common::DefaultNVGPUTarget()); + Initialize(cinn::common::DefaultNVGPUTarget()); ir::IRSchedule ir_schedule = MakeIRSchedule(matmul_op); std::vector func_bodys = ir_schedule.GetModule().GetExprs(); ASSERT_EQ(func_bodys.size(), 1UL); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc index 9cc02eefac7e5f..8b99fd6e61e221 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc @@ -38,7 +38,7 @@ namespace cinn { namespace auto_schedule { -MultiLevelTiling::MultiLevelTiling(const common::Target& target, +MultiLevelTiling::MultiLevelTiling(const 
cinn::common::Target& target, const Config& config) : AutoGenRule(target), config_(config) { for (int i = 0; i < config_.tile_struct.size(); ++i) { @@ -434,9 +434,9 @@ void MultiLevelTiling::ApplyCacheWrite(ir::IRSchedule* ir_schedule, } } -const std::unordered_map<common::Target::Arch, MultiLevelTiling::Config> +const std::unordered_map<cinn::common::Target::Arch, MultiLevelTiling::Config> MultiLevelTiling::kConfigs{ - {common::Target::Arch::NVGPU, + {cinn::common::Target::Arch::NVGPU, MultiLevelTiling::Config{ /*bind_axis*/ std::vector<std::string>{"blockIdx.x", "threadIdx.x"}, @@ -446,7 +446,7 @@ const std::unordered_map /*write_cache_memory_type*/ std::string("local"), /*write_cache_levels*/ std::vector<int>{3}, }}, - {common::Target::Arch::X86, + {cinn::common::Target::Arch::X86, MultiLevelTiling::Config{ /*bind_axis*/ std::vector<std::string>{}, /*tile_struct*/ std::string("SSRSRS"), diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h index 3bcf22a812ae0d..617cc24998bbb5 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h @@ -53,9 +53,9 @@ class MultiLevelTiling : public AutoGenRule { std::vector<int> write_cache_levels; }; - static const std::unordered_map<common::Target::Arch, Config> kConfigs; + static const std::unordered_map<cinn::common::Target::Arch, Config> kConfigs; - MultiLevelTiling(const common::Target& target, const Config& config); + MultiLevelTiling(const cinn::common::Target& target, const Config& config); ~MultiLevelTiling() = default; // initialize the AutoGenRule, it must be called before further actions. diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc index 62f1bb74f4ac0e..bf7d8fb0b7d560 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling_test.cc @@ -44,9 +44,9 @@ TEST(MultiLevelTile, SampleSplitTwo) { srand(0); Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif MultiLevelTiling multi_level_tiling( @@ -66,9 +66,9 @@ TEST(MultiLevelTile, SampleTileSplit) { srand(0); Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif MultiLevelTiling multi_level_tiling( @@ -93,9 +93,9 @@ TEST(MultiLevelTile, SimpleLoops) { srand(0); Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif Expr M(32); @@ -148,9 +148,9 @@ TEST(MulitLevelTile, MatrixMultiply) { srand(0); Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif Expr M(32); @@ -214,7 +214,7 @@ TEST_F(TestMultiLevelTiling, Matmul) { std::vector<int32_t> X_shape = {32, 32}; std::vector<int32_t> Y_shape = {32, 32}; std::vector<int32_t> out_shape = {32, 32}; -
Initialize(common::DefaultNVGPUTarget()); + Initialize(cinn::common::DefaultNVGPUTarget()); frontend::Program matmul_op = tests::OpBuilder("matmul").Build({{"X", X_shape}, {"Y", Y_shape}}); ir::IRSchedule ir_schedule = MakeIRSchedule(matmul_op, fixed_rand_seed); @@ -365,7 +365,7 @@ TEST_F(TestMultiLevelTiling, ReduceSum) { std::vector out_shape = {1, 16, 1}; std::vector reduce_dim = {2}; - Initialize(common::DefaultNVGPUTarget()); + Initialize(cinn::common::DefaultNVGPUTarget()); frontend::Program reduce_sum_op = tests::OpBuilder("reduce_sum") .Build({{"X", X_shape}}, {{"dim", reduce_dim}, {"keep_dim", false}}); @@ -408,7 +408,7 @@ TEST_F(TestMultiLevelTiling, Pool2d) { {"adaptive", adaptive}, {"padding_algorithm", padding_algorithm}}); - Initialize(common::DefaultNVGPUTarget()); + Initialize(cinn::common::DefaultNVGPUTarget()); ir::IRSchedule ir_schedule = MakeIRSchedule(pool2d_program, fixed_rand_seed); SearchState state(ir_schedule); VLOG(6) << "Original state:\n" << state->DebugString(); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc index c8b8fdeb0f554d..85bc207c84fc7d 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.cc @@ -189,7 +189,7 @@ void ReductionFactoring::Apply(const std::string& block_name, ir_schedule->GetBlock(block_name + "_rf__reduce_init"); ir_schedule->SimpleComputeAt(rf_init_block, rb_loops.back()); - if (*target_ == common::DefaultNVGPUTarget()) { + if (*target_ == cinn::common::DefaultNVGPUTarget()) { rb_loops = ir_schedule->GetLoops(block_name); rf_block = ir_schedule->GetBlock(block_name + "_rf"); ir_schedule->Bind(rb_loops.back(), "threadIdx.x"); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h index 889e3e94292d2d..90963e831075c5 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring.h @@ -26,7 +26,7 @@ namespace auto_schedule { class ReductionFactoring : public AutoGenRule { public: - explicit ReductionFactoring(const common::Target& target) + explicit ReductionFactoring(const cinn::common::Target& target) : AutoGenRule(target) {} ~ReductionFactoring() = default; diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc index 6848fba586944e..6f475d09de9316 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/reduction_factoring_test.cc @@ -39,7 +39,7 @@ class TestReductionFactoring : public TestAutoGenRuleBase { const std::vector& reduce_dim, const std::string& block_name, const std::string& expected_ir) { - Initialize(common::DefaultNVGPUTarget()); + Initialize(cinn::common::DefaultNVGPUTarget()); // In order to forcibly use the most basic Compute of reduction FLAGS_cinn_new_group_scheduler = 1; auto test_program = tests::ReduceBuilder().Build( @@ -71,7 +71,7 @@ class TestReductionFactoring : public TestAutoGenRuleBase { TEST_F(TestReductionFactoring, AnalyseApplyType) { Context::Global().ResetNameId(); - Initialize(common::DefaultNVGPUTarget()); + 
Initialize(cinn::common::DefaultNVGPUTarget()); auto test_program = tests::OpBuilder("elementwise_add").Build({{"X", {4, 5}}, {"Y", {4, 5}}}); ir::IRSchedule ir_schedule = MakeIRSchedule(test_program); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc index 7810822299c8c7..c33641c0efae24 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc @@ -24,7 +24,7 @@ namespace cinn { namespace auto_schedule { -SkipRule::SkipRule(const common::Target& target) : AutoGenRule(target) {} +SkipRule::SkipRule(const cinn::common::Target& target) : AutoGenRule(target) {} RuleApplyType SkipRule::Init(ir::IRSchedule* ir_schedule) { ir_schedule_ = ir_schedule; diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h index b6862c23e7d2cb..a2d9e2bd5b8202 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.h @@ -25,7 +25,7 @@ namespace auto_schedule { class SkipRule : public AutoGenRule { public: - explicit SkipRule(const common::Target& target); + explicit SkipRule(const cinn::common::Target& target); ~SkipRule() = default; RuleApplyType Init(ir::IRSchedule* init_schedule) override; diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc index 5ba15a46fef188..f64bf387f59348 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule_test.cc @@ -39,9 +39,9 @@ TEST(SkipRule, Basic) { srand(0); Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif Expr M(32); @@ -88,9 +88,9 @@ TEST(SkipRule, ApplyOnSpecificBlock) { srand(0); Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif Expr M(32); diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc index 11fabfe16df2f0..257fb1a6a935de 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.cc @@ -41,7 +41,7 @@ using ::cinn::hlir::framework::Scope; using ::cinn::hlir::framework::Shape; using ::cinn::hlir::framework::Tensor; -void TestAutoGenRuleBase::Initialize(const common::Target& target) { +void TestAutoGenRuleBase::Initialize(const cinn::common::Target& target) { target_ = target; backend_compier_ = backends::Compiler::Create(target); } @@ -56,9 +56,8 @@ ir::IRSchedule TestAutoGenRuleBase::MakeIRSchedule( hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); LOG_IF(WARNING, graph->fusion_groups.size() > 1) << "Test Graph has more than 1 group"; - auto& dtype_dict = - graph->GetMutableAttrs>( - "inferdtype"); + auto& dtype_dict = graph->GetMutableAttrs< + absl::flat_hash_map>("inferdtype"); 
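// NOTE (illustrative sketch, not from the diff): every helper in this section
// fetches the same two graph attributes after pass application. The pattern,
// with template arguments spelled out on the assumption that they match the
// surrounding CINN code, is:
//   auto& dtype_dict = graph->GetMutableAttrs<
//       absl::flat_hash_map<std::string, cinn::common::Type>>("inferdtype");
//   auto& shape_dict = graph->GetMutableAttrs<
//       absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
//       "infershape");
// Both maps are keyed by tensor name and feed the op lowerer created below.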
auto& shape_dict = graph->GetMutableAttrs< absl::flat_hash_map<std::string, shape_t>>("infershape"); auto op_lowerer = @@ -107,7 +106,7 @@ ir::Module TestAutoGenRuleBase::BuildIRModule(const ir::IRSchedule& schedule) { std::string TestAutoGenRuleBase::GenSourceCode(const ir::Module& ir_module) { std::unique_ptr<backends::CodeGenC> codegen; #ifdef CINN_WITH_CUDA - if (target_ == common::DefaultNVGPUTarget()) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { codegen = std::make_unique<backends::CodeGenCUDA_Dev>(this->target_); } else { codegen = std::make_unique<backends::CodeGenCX86>( @@ -151,7 +150,7 @@ void MemoryCopy(const float* src, float* dst, int numel, std::string type) { } void AddDataToScope(Scope* scope, - const common::Target& target, + const cinn::common::Target& target, float* data_ptr, std::string name, const std::vector<int>& shape) { @@ -161,8 +160,9 @@ void AddDataToScope(Scope* scope, Shape cinn_shape(shape); tensor->Resize(cinn_shape); auto* tgt_data_ptr = tensor->mutable_data<float>(target); - std::string mem_cpy_type = - target == common::DefaultNVGPUTarget() ? "DeviceToHost" : "HostToHost"; + std::string mem_cpy_type = target == cinn::common::DefaultNVGPUTarget() + ? "DeviceToHost" + : "HostToHost"; MemoryCopy(data_ptr, tgt_data_ptr, cinn_shape.numel(), mem_cpy_type); } @@ -172,7 +172,7 @@ void CheckResult(raw_func_type test_func, const std::vector<std::string>& output_names, const std::vector<std::vector<int>>& input_shapes, const std::vector<std::vector<int>>& output_shapes, - const common::Target& target) { + const cinn::common::Target& target) { CHECK(input_names.size()) << "The number of inputs must be greater than 0."; CHECK(output_names.size()) << "The number of outputs must be greater than 0."; CHECK_EQ(input_names.size(), input_shapes.size()) @@ -239,7 +239,7 @@ void CheckResult(raw_func_type test_func, // data for (int i = 0; i < output_names.size(); ++i) { const float* result_ptr = scope.GetTensor(output_names[i])->data<float>(); - std::string mem_cpy_type = target == common::DefaultNVGPUTarget() ?
"DeviceToHost" : "HostToHost"; MemoryCopy( diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h index 73ef166e37b416..b808c046b752cf 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h @@ -45,7 +45,7 @@ class TestAutoGenRuleBase : public ::testing::Test { Context::Global().ResetNameId(); } // Initialize context for specified target - void Initialize(const common::Target& target); + void Initialize(const cinn::common::Target& target); // construct an ir::IRSchedule by lowering the specified for following // AutoGenRule test @@ -68,7 +68,7 @@ class TestAutoGenRuleBase : public ::testing::Test { raw_func_type GenExecutableKernel(const ir::Module& ir_module); protected: - common::Target target_; + cinn::common::Target target_; std::vector lowered_funcs_; std::unique_ptr backend_compier_; }; @@ -92,7 +92,7 @@ void CheckResult(raw_func_type test_func, const std::vector& output_names, const std::vector>& input_shapes, const std::vector>& output_shapes, - const common::Target& target); + const cinn::common::Target& target); } // namespace auto_schedule } // namespace cinn diff --git a/paddle/cinn/auto_schedule/search_space/rule_sampler_test.cc b/paddle/cinn/auto_schedule/search_space/rule_sampler_test.cc index 2c21477a1bc590..442e108d948cc6 100644 --- a/paddle/cinn/auto_schedule/search_space/rule_sampler_test.cc +++ b/paddle/cinn/auto_schedule/search_space/rule_sampler_test.cc @@ -23,9 +23,9 @@ namespace cinn { namespace auto_schedule { #ifdef CINN_WITH_CUDA -Target target = common::DefaultNVGPUTarget(); +Target target = cinn::common::DefaultNVGPUTarget(); #else -Target target = common::DefaultHostTarget(); +Target target = cinn::common::DefaultHostTarget(); #endif std::vector GenerateTestRules() { diff --git a/paddle/cinn/auto_schedule/search_space/search_state.cc b/paddle/cinn/auto_schedule/search_space/search_state.cc index d380691faf62b5..8aff7dac3c2110 100644 --- a/paddle/cinn/auto_schedule/search_space/search_state.cc +++ b/paddle/cinn/auto_schedule/search_space/search_state.cc @@ -32,7 +32,8 @@ namespace auto_schedule { SearchState::SearchState(ir::IRSchedule ir_sch, float cost, const std::vector& rules) - : common::Shared<_SearchState_>(common::make_shared<_SearchState_>()) { + : cinn::common::Shared<_SearchState_>( + cinn::common::make_shared<_SearchState_>()) { auto* state = get(); state->ir_schedule = std::move(ir_sch); state->applicable_rules = rules; diff --git a/paddle/cinn/auto_schedule/search_space/search_state.h b/paddle/cinn/auto_schedule/search_space/search_state.h index a5684eb6189522..6852fb1c99186b 100644 --- a/paddle/cinn/auto_schedule/search_space/search_state.h +++ b/paddle/cinn/auto_schedule/search_space/search_state.h @@ -31,7 +31,7 @@ struct _SearchState_; class AutoGenRule; //! Shared Wrapper for _SearchState_ -class SearchState : public common::Shared<_SearchState_> { +class SearchState : public cinn::common::Shared<_SearchState_> { public: SearchState() = default; // create a new SearchState @@ -49,7 +49,7 @@ class SearchState : public common::Shared<_SearchState_> { }; //! 
Class to store immediate states during search -struct _SearchState_ : public common::Object { +struct _SearchState_ : public cinn::common::Object { // IRSchedule contains ir::ModuleExpr and trace scheduling process ir::IRSchedule ir_schedule; // Cost model predicted cost diff --git a/paddle/cinn/auto_schedule/search_space/search_state_test.cc b/paddle/cinn/auto_schedule/search_space/search_state_test.cc index b0f216c4895aa1..3ab24fd1fdb106 100644 --- a/paddle/cinn/auto_schedule/search_space/search_state_test.cc +++ b/paddle/cinn/auto_schedule/search_space/search_state_test.cc @@ -25,7 +25,7 @@ namespace cinn { namespace auto_schedule { TEST(TestSearchState, SearchStateHash_Equal) { - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); ir::Expr M(32); ir::Expr N(32); diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc index 539be166f28cf2..6a983d7f9aaac8 100644 --- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc +++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search_test.cc @@ -41,7 +41,7 @@ std::vector CreateTasks(const frontend::Program& program, TaskCreator task_creator; auto tasks = task_creator.CreateTuneTaskOpLevel(graph.get()); const auto& dtype_dict = - graph->GetAttrs>( + graph->GetAttrs>( "inferdtype"); const auto& shape_dict = graph->GetAttrs< absl::flat_hash_map>("infershape"); @@ -93,7 +93,7 @@ class MockSearchSpace : public SearchSpace { class MockCostModel : public ExprCostModel { float Predict(const ir::ModuleExpr& sample, - const common::Target& target) const override { + const cinn::common::Target& target) const override { float cost = 0.0f; std::vector exprs = sample.GetExprs(); for (const ir::Expr& expr : exprs) { @@ -108,7 +108,7 @@ class MockCostModel : public ExprCostModel { TEST(EvolutionarySearch, GetOneBest) { TuneTask mock_tune_task; mock_tune_task.serialized_key = "mock_task"; - mock_tune_task.target = common::DefaultTarget(); + mock_tune_task.target = cinn::common::DefaultTarget(); InitialTaskRegistry* task_registry = InitialTaskRegistry::Global(); task_registry->Regist(mock_tune_task.serialized_key, ir::ModuleExpr({ir::Expr(0)})); @@ -131,7 +131,7 @@ TEST(EvolutionarySearch, GetOneBest) { TEST(EvolutionarySearch, GetEpsGreedy) { TuneTask mock_tune_task; mock_tune_task.serialized_key = "mock_task"; - mock_tune_task.target = common::DefaultTarget(); + mock_tune_task.target = cinn::common::DefaultTarget(); InitialTaskRegistry* task_registry = InitialTaskRegistry::Global(); task_registry->Regist(mock_tune_task.serialized_key, ir::ModuleExpr({ir::Expr(0)})); @@ -155,7 +155,7 @@ TEST(EvolutionarySearch, GetEpsGreedy) { } TEST(EvolutionarySearch, Evolve) { - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto tasks = CreateTasks( tests::OpBuilder("matmul").Build({{"X", {32, 32}}, {"Y", {32, 32}}}), target); diff --git a/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc index 94222d748c0546..87930cf81ce632 100644 --- a/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc +++ b/paddle/cinn/auto_schedule/search_strategy/mutate_rule/mutate_tile_size_test.cc @@ -28,9 +28,9 @@ TEST(MutateTileSize, Basic) { srand(0); Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = 
common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif const int kSize = 32; diff --git a/paddle/cinn/auto_schedule/task/task_creator_test.cc b/paddle/cinn/auto_schedule/task/task_creator_test.cc index 60b5ebec0e808c..2cb80727d9bc4f 100644 --- a/paddle/cinn/auto_schedule/task/task_creator_test.cc +++ b/paddle/cinn/auto_schedule/task/task_creator_test.cc @@ -50,9 +50,9 @@ Program CreateAddProgram() { TEST(TaskCreator, Basic) { #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif Program prog = CreateAddProgram(); auto graph = std::make_shared<hlir::framework::Graph>(prog, target); diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.cc b/paddle/cinn/auto_schedule/task/task_optimizer.cc index d76797d9953ecd..273cba4c4060e6 100644 --- a/paddle/cinn/auto_schedule/task/task_optimizer.cc +++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc @@ -49,12 +49,12 @@ using cinn::hlir::op::ExternalApiRegistry; // *** forward declarations of auxiliary functions to be used in this file only // *** update a scheduled function with several post-processors -ir::LoweredFunc FuncWithUpdatedBody(const common::Target& target, +ir::LoweredFunc FuncWithUpdatedBody(const cinn::common::Target& target, const ir::LoweredFunc& old_func, ir::Expr& body); // NOLINT // check whether a scheduled lowered function is valid bool PruneInvalid(const ir::LoweredFunc& lowered_func, - const common::Target& target); + const cinn::common::Target& target); // exclude some special tasks bool IsForbiddenToTune(const TuneTask* task); // tell whether the task has been wrapped by custom_call in @@ -441,11 +441,11 @@ bool IsGPUMemoryUsageExceedLimit(const ir::LoweredFunc& lowered_func, } bool PruneInvalid(const ir::LoweredFunc& lowered_func, - const common::Target& target) { + const cinn::common::Target& target) { static const size_t kGPUSharedMemoryLimitBytes = GetGPUSharedMemoryLimit(); static const size_t kGPULocalStackLimitBytes = GetGPULocalStackLimit(); - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { if (IsGPUMemoryUsageExceedLimit(lowered_func, ir::MemoryType::GPUShared, kGPUSharedMemoryLimitBytes)) { diff --git a/paddle/cinn/auto_schedule/task/task_registry_test.cc b/paddle/cinn/auto_schedule/task/task_registry_test.cc index 23f31028be9e93..579d9378511de9 100644 --- a/paddle/cinn/auto_schedule/task/task_registry_test.cc +++ b/paddle/cinn/auto_schedule/task/task_registry_test.cc @@ -34,13 +34,13 @@ namespace cinn { namespace auto_schedule { std::vector<TuneTask> CreateTasks(hlir::framework::Graph* graph, - const common::Target& target) { + const cinn::common::Target& target) { // create tasks TaskCreator task_creator; std::vector<TuneTask> tasks = task_creator.CreateTuneTaskOpLevel(graph); const auto& dtype_dict = - graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>( + graph->GetAttrs<absl::flat_hash_map<std::string, cinn::common::Type>>( "inferdtype"); const auto& shape_dict = graph->GetAttrs< absl::flat_hash_map<std::string, shape_t>>("infershape"); @@ -56,7 +56,7 @@ std::vector<TuneTask> CreateTasks(hlir::framework::Graph* graph, } std::shared_ptr<hlir::framework::Graph> CreateAddProgram( - const common::Target& target) { + const cinn::common::Target& target) { frontend::NetBuilder builder("test"); auto a = builder.CreateInput(Float(32), {1, 64, 112, 112}, "A"); @@ -70,9 +70,9 @@ TEST(TestTaskRegistry, basic) {
FLAGS_auto_schedule_use_cost_model = true; #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::shared_ptr<hlir::framework::Graph> graph = CreateAddProgram(target); std::vector<TuneTask> tasks = CreateTasks(graph.get(), target); diff --git a/paddle/cinn/auto_schedule/task/tune_task.cc b/paddle/cinn/auto_schedule/task/tune_task.cc index f2c2b720b6f062..30353d2db584d3 100644 --- a/paddle/cinn/auto_schedule/task/tune_task.cc +++ b/paddle/cinn/auto_schedule/task/tune_task.cc @@ -63,7 +63,8 @@ std::string TuneTask::SerializeToString( // local function to print dtype,shape of out/in variables of the specified // node auto print_node_links_fn = - [&](const std::vector<common::Shared<common::GraphEdge>>& links, + [&](const std::vector<cinn::common::Shared<cinn::common::GraphEdge>>& + links, bool is_input) { int printed_num = 0; for (auto&& edge : links) { diff --git a/paddle/cinn/auto_schedule/task/tune_task.h b/paddle/cinn/auto_schedule/task/tune_task.h index 92bf5c73ca3f3d..b69c86917602c5 100644 --- a/paddle/cinn/auto_schedule/task/tune_task.h +++ b/paddle/cinn/auto_schedule/task/tune_task.h @@ -54,7 +54,7 @@ class TuneTask { // Lower handler, Not owned hlir::framework::OpLowerer* op_lowerer; // target of this task - common::Target target; + cinn::common::Target target; // stores the initial (un-optimized) LoweredFuncs std::vector<ir::LoweredFunc> lowered_funcs; // names of the output arguments of lowered_funcs_ diff --git a/paddle/cinn/auto_schedule/task/tune_task_test.cc b/paddle/cinn/auto_schedule/task/tune_task_test.cc index 41fe147d6d60c4..733197b0a6f97d 100644 --- a/paddle/cinn/auto_schedule/task/tune_task_test.cc +++ b/paddle/cinn/auto_schedule/task/tune_task_test.cc @@ -59,9 +59,9 @@ Program CreateAddProgram() { TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) { Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif Program prog = CreateAddProgram(); auto graph = std::make_shared<hlir::framework::Graph>(prog, target); @@ -73,7 +73,7 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) { const auto& shape_dict = graph->GetAttrs< absl::flat_hash_map<std::string, shape_t>>("infershape"); const auto& dtype_dict = - graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>( + graph->GetAttrs<absl::flat_hash_map<std::string, cinn::common::Type>>( "inferdtype"); auto op_lowerer = hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target); @@ -169,9 +169,9 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) { TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) { Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif Program prog = CreateAddProgram(); auto graph = std::make_shared<hlir::framework::Graph>(prog, target); @@ -185,7 +185,7 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) { const auto& shape_dict = graph->GetAttrs< absl::flat_hash_map<std::string, shape_t>>("infershape"); const auto& dtype_dict = - graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>( + graph->GetAttrs<absl::flat_hash_map<std::string, cinn::common::Type>>( "inferdtype"); OpLowerer op_lowerer( @@ -277,9 +277,9 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) { TEST(TuneTask, SerializeToString) { Context::Global().ResetNameId(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); +
Target target = cinn::common::DefaultHostTarget(); #endif Program prog = CreateAddProgram(); auto graph = std::make_shared(prog, target); @@ -291,7 +291,7 @@ TEST(TuneTask, SerializeToString) { const auto& shape_dict = graph->GetAttrs< absl::flat_hash_map>("infershape"); const auto& dtype_dict = - graph->GetAttrs>( + graph->GetAttrs>( "inferdtype"); OpLowerer op_lowerer( new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target)); diff --git a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc index ce7a9369f847ef..2966467b3eda67 100644 --- a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc +++ b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc @@ -138,7 +138,7 @@ class PerformanceTester : public ::testing::Test { std::unique_ptr BuildNoScheduleProgram( Graph* graph, GraphCompiler* graph_compiler) { const auto& dtype_dict = - graph->GetAttrs>( + graph->GetAttrs>( "inferdtype"); const auto& shape_dict = graph->GetAttrs< absl::flat_hash_map>( @@ -211,9 +211,9 @@ class PerformanceTester : public ::testing::Test { } #ifdef CINN_WITH_CUDA - Target target_ = common::DefaultNVGPUTarget(); + Target target_ = cinn::common::DefaultNVGPUTarget(); #else - Target target_ = common::DefaultHostTarget(); + Target target_ = cinn::common::DefaultHostTarget(); #endif Options options_; }; @@ -340,7 +340,7 @@ TEST_F(PerformanceTester, LookupTable) { Evaluate(tests::OpBuilder("lookup_table") .Build({{"table", {50001, 768}}, - {"ids", {10, 128, 1}, common::Int(64)}}, + {"ids", {10, 128, 1}, cinn::common::Int(64)}}, {{"padding_idx", padding_idx}})); } @@ -349,7 +349,7 @@ TEST_F(PerformanceTester, Gather) { Evaluate(tests::OpBuilder("gather").Build( {{"operand", {10, 12, 128, 512}}, - {"index", {1, 1, 1, 128}, common::Int(32)}}, + {"index", {1, 1, 1, 128}, cinn::common::Int(32)}}, {{"axis", axis}})); } @@ -359,8 +359,9 @@ TEST_F(PerformanceTester, ResNet50) { FLAGS_cinn_infer_model_version = 1.0; std::unordered_map> feeds = { {"inputs", {batch_size, 3, 224, 224}}}; - Evaluate(cinn::frontend::PaddleModelConvertor(common::DefaultNVGPUTarget()) - .LoadModel(FLAGS_resnet50_model_dir, true, feeds)); + Evaluate( + cinn::frontend::PaddleModelConvertor(cinn::common::DefaultNVGPUTarget()) + .LoadModel(FLAGS_resnet50_model_dir, true, feeds)); } } // namespace auto_schedule diff --git a/paddle/cinn/backends/codegen_c.cc b/paddle/cinn/backends/codegen_c.cc index c3c882f9e8f988..282a338204f26d 100644 --- a/paddle/cinn/backends/codegen_c.cc +++ b/paddle/cinn/backends/codegen_c.cc @@ -120,10 +120,11 @@ std::string CodeGenC::GetTypeName(Type type) { auto customized_name = type.customized_type(); // get name of a cuda built-in vector type, it is started with a // 'CudaVectorType::' prefix - if (utils::Startswith(customized_name, - common::customized_type::kcuda_builtin_vector_t)) { + if (utils::Startswith( + customized_name, + cinn::common::customized_type::kcuda_builtin_vector_t)) { customized_name.erase( - 0, strlen(common::customized_type::kcuda_builtin_vector_t)); + 0, strlen(cinn::common::customized_type::kcuda_builtin_vector_t)); } return customized_name; } @@ -653,7 +654,7 @@ void CodeGenC::PrintBufferCreation(const std::vector &buffers) { DoIndent(); auto buffer_ptr_type = Type() - .set_customized_type(common::customized_type::kbuffer_t) + .set_customized_type(cinn::common::customized_type::kbuffer_t) .set_cpp_handle(); Var variable = ir::_Var_::Make(buffer->name, buffer_ptr_type); auto expr = 
ir::intrinsics::BufferCreate::Make(buffer); diff --git a/paddle/cinn/backends/codegen_c_test.cc b/paddle/cinn/backends/codegen_c_test.cc index caf4950cdfe8cd..91f80c190f0f85 100644 --- a/paddle/cinn/backends/codegen_c_test.cc +++ b/paddle/cinn/backends/codegen_c_test.cc @@ -286,7 +286,7 @@ TEST(CodeGenC, matmul_tile) { // Code gen auto func = Lower("matmul", stages, {A, B, C}); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Module::Builder builder("module1", target); builder.AddFunction(func); @@ -373,7 +373,7 @@ TEST(CodeGenC, matmul_packed) { // Code gen auto func = Lower("matmul_with_packing", stages, {A, B, packedB, C}); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Module::Builder builder("module1", target); builder.AddFunction(func); @@ -445,10 +445,10 @@ TEST(CodeGenC, call_extern) { auto yexpr = Lower("yy", stages, {y}); - Module::Builder builder("module0", common::DefaultHostTarget()); + Module::Builder builder("module0", cinn::common::DefaultHostTarget()); builder.AddFunction(yexpr); - CodeGenC codegen(common::DefaultHostTarget()); + CodeGenC codegen(cinn::common::DefaultHostTarget()); codegen.SetInlineBuiltinCodes(false); auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); std::cout << "codegen C:" << std::endl << out << std::endl; diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc index 5a1ddbc450a091..4e7fa79d2d0b30 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.cc +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -339,8 +339,9 @@ void CodeGenCUDA_Dev::Visit(const ir::Let *op) { // identify vectorized tensors by checking their dtypes are customized_type // with customized_type::kcuda_builtin_vector_t prefix, and save their names if (op->type().is_customized() && - utils::Startswith(op->type().customized_type(), - common::customized_type::kcuda_builtin_vector_t)) { + utils::Startswith( + op->type().customized_type(), + cinn::common::customized_type::kcuda_builtin_vector_t)) { str_ += GetTypeRepr(op->type()); if (op->type().is_cpp_handle()) { str_ += " "; diff --git a/paddle/cinn/backends/codegen_cuda_util.h b/paddle/cinn/backends/codegen_cuda_util.h index 71be91a855c8ed..5a7f1f5882bf9b 100644 --- a/paddle/cinn/backends/codegen_cuda_util.h +++ b/paddle/cinn/backends/codegen_cuda_util.h @@ -47,9 +47,10 @@ namespace detail { struct CollectHostFunctionVisitor : public ir::IRMutator<> { explicit CollectHostFunctionVisitor(const std::string& module_name) - : host_module_builder(module_name + "_host", common::DefaultHostTarget()), + : host_module_builder(module_name + "_host", + cinn::common::DefaultHostTarget()), device_module_builder(module_name + "_gpu_device", - common::DefaultNVGPUTarget()) {} + cinn::common::DefaultNVGPUTarget()) {} std::tuple operator()(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); diff --git a/paddle/cinn/backends/codegen_debug_test.cc b/paddle/cinn/backends/codegen_debug_test.cc index a156f5475b3db7..6ed5e37685b703 100644 --- a/paddle/cinn/backends/codegen_debug_test.cc +++ b/paddle/cinn/backends/codegen_debug_test.cc @@ -61,7 +61,7 @@ CUdeviceptr CreateCudaMemory(const std::vector& shape, const T* data) { } TEST(CodeGenDebug, RunCudaSourceCode) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); std::string source_code = R"ROC( extern "C" { diff --git a/paddle/cinn/backends/compiler_test.cc b/paddle/cinn/backends/compiler_test.cc index 
84abedd91e5b61..1c14fcd4ffa64b 100644 --- a/paddle/cinn/backends/compiler_test.cc +++ b/paddle/cinn/backends/compiler_test.cc @@ -50,26 +50,30 @@ TEST(Compiler, x86) { auto fn = Lower("fn", stages, {A, B, C}); - ir::Module::Builder builder("some_module", common::DefaultHostTarget()); + ir::Module::Builder builder("some_module", + cinn::common::DefaultHostTarget()); builder.AddFunction(fn); - auto compiler = Compiler::Create(common::DefaultHostTarget()); + auto compiler = Compiler::Create(cinn::common::DefaultHostTarget()); compiler->Build(builder.Build()); auto* fnp = compiler->Lookup("fn"); ASSERT_TRUE(fnp); - auto* Ab = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_random() - .Build(); - auto* Bb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_random() - .Build(); - auto* Cb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_zero() - .Build(); - - auto args = common::ArgsBuilder().Add(Ab).Add(Bb).Add(Cb).Build(); + auto* Ab = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_random() + .Build(); + auto* Bb = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_random() + .Build(); + auto* Cb = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_zero() + .Build(); + + auto args = cinn::common::ArgsBuilder().Add(Ab).Add(Bb).Add(Cb).Build(); reinterpret_cast(fnp)(args.data(), args.size()); // test result @@ -107,24 +111,28 @@ TEST(Compiler, cuda) { auto fn = Lower("fn", stages, {A, B, C}); - ir::Module::Builder builder("some_module", common::DefaultHostTarget()); + ir::Module::Builder builder("some_module", + cinn::common::DefaultHostTarget()); builder.AddFunction(fn); - auto compiler = Compiler::Create(common::DefaultNVGPUTarget()); + auto compiler = Compiler::Create(cinn::common::DefaultNVGPUTarget()); compiler->Build(builder.Build()); auto* fnp = compiler->Lookup("fn"); ASSERT_TRUE(fnp); - auto* Ab = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_random() - .Build(); - auto* Bb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_random() - .Build(); - auto* Cb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_zero() - .Build(); + auto* Ab = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_random() + .Build(); + auto* Bb = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_random() + .Build(); + auto* Cb = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_zero() + .Build(); // allocate CUDA buffer void *Ag, *Bg, *Cg; @@ -144,7 +152,8 @@ TEST(Compiler, cuda) { cinn_buffer_t Cbb; Cbb.memory = reinterpret_cast(Cg); - auto args = common::ArgsBuilder().Add(&Abb).Add(&Bbb).Add(&Cbb).Build(); + auto args = + cinn::common::ArgsBuilder().Add(&Abb).Add(&Bbb).Add(&Cbb).Build(); utils::Timer timer; timer.Start(); @@ -204,10 +213,10 @@ TEST(Compiler, sqrt) { auto fn = Lower("fn", stages, {input, mean, scale, bias, variance, A, B[0], BB}); - Module::Builder builder("some", common::DefaultHostTarget()); + Module::Builder builder("some", cinn::common::DefaultHostTarget()); builder.AddFunction(fn); - auto compiler = Compiler::Create(common::DefaultHostTarget()); + auto compiler = Compiler::Create(cinn::common::DefaultHostTarget()); compiler->Build(builder.Build()); } diff --git a/paddle/cinn/backends/ir_schedule_test.cc b/paddle/cinn/backends/ir_schedule_test.cc index cdcc709d9d45bf..1d6b92933b9f16 100644 --- 
a/paddle/cinn/backends/ir_schedule_test.cc +++ b/paddle/cinn/backends/ir_schedule_test.cc @@ -41,7 +41,7 @@ TEST(IrSchedule, split_and_fuse1) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -100,7 +100,7 @@ TEST(IrSchedule, split_and_fuse2) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -163,7 +163,7 @@ void TestSplitThrow() { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -205,7 +205,7 @@ TEST(IrSchedule, reorder1) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -272,7 +272,7 @@ TEST(IrSchedule, reorder2) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -338,7 +338,7 @@ TEST(IrSchedule, reorder3) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -408,7 +408,7 @@ TEST(IrSchedule, reorder4) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -482,7 +482,7 @@ TEST(IrSchedule, parallel) { Expr M(32); Expr N(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -548,7 +548,7 @@ TEST(IrSchedule, vectorize) { Expr M(32); Expr N(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -622,7 +622,7 @@ TEST(IrSchedule, unroll) { Expr M(32); Expr N(2); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -696,7 +696,7 @@ TEST(IrSchedule, bind) { Expr M(32); Expr N(2); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -741,7 +741,7 @@ TEST(IrSchedule, simple_compute_at) { Expr M(128); Expr N(10); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -814,7 +814,7 @@ TEST(IrSchedule, compute_at0) { Expr M(128); Expr N(10); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -888,7 +888,7 @@ TEST(IrSchedule, compute_at1) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -960,7 +960,7 @@ TEST(IrSchedule, compute_at2) { Expr M(64); Expr N(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, M}); auto B = Compute( @@ -1032,7 +1032,7 @@ TEST(IrSchedule, compute_at3) { Expr M(64); Expr N(32); - Target target = common::DefaultHostTarget(); + Target target = 
cinn::common::DefaultHostTarget(); Placeholder A("A", {M, M}); auto B = Compute( @@ -1112,7 +1112,7 @@ TEST(IrSchedule, compute_at4) { Expr N(32); Expr P(32); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -1174,7 +1174,7 @@ TEST(IrSchedule, compute_at5) { Expr M(64); Expr N(32); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Placeholder A("A", {M, M}); auto B = Compute( @@ -1237,7 +1237,7 @@ TEST(IrSchedule, compute_at6) { Expr M(64); Expr N(32); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Placeholder A("A", {M, M}); auto B = Compute( @@ -1303,7 +1303,7 @@ TEST(IrSchedule, cache_read1) { Expr N(32); Expr P(16); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, M}); auto B = Compute( @@ -1388,7 +1388,7 @@ TEST(IrSchedule, cache_read2) { Expr M(64); Expr N(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -1456,7 +1456,7 @@ TEST(IrSchedule, cache_write1) { Expr M(64); Expr N(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -1542,7 +1542,7 @@ TEST(IrSchedule, cache_write2) { Expr M(64); Expr N(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -1610,7 +1610,7 @@ TEST(IrSchedule, cache_read3) { Expr N(32); Expr P(16); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Placeholder A("A", {M, M}); auto B = Compute( @@ -1691,7 +1691,7 @@ TEST(IrSchedule, cache_write3) { Expr M(64); Expr N(32); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -1773,7 +1773,7 @@ TEST(IrSchedule, sync_threads) { Expr M(64); Expr N(32); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -1854,7 +1854,7 @@ TEST(IrSchedule, cache_write4) { Expr M(64); Expr N(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, N}); Var k(32, "k0"); @@ -1930,7 +1930,7 @@ TEST(IrSchedule, rfactor) { Expr N(2); Expr K(16); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, K}); Var j(2, "j0"); @@ -2057,7 +2057,7 @@ TEST(IrSchedule, rfactor1) { Expr N(2); Expr K(16); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, K}); Var j(2, "j0"); @@ -2185,7 +2185,7 @@ TEST(IrSchedule, rfactor2) { Expr N(2); Expr K(16); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, K}); Placeholder B("B", {K, N}); @@ -2318,7 +2318,7 @@ TEST(IrSchedule, factorize_reduction) { Expr N(4); Expr K(5); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, K}); Var j(4, "j0"); @@ -2407,7 +2407,7 @@ TEST(IrSchedule, factorize_reduction1) { Expr N(4); Expr K(5); 
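// NOTE (illustrative sketch, not from the diff): the recurring edit in these
// tests, common:: to cinn::common::, is presumably needed because a top-level
// ::common namespace now coexists with cinn::common. A minimal, hypothetical
// reproduction of the lookup problem that full qualification avoids; none of
// these declarations are CINN code:
//
//   namespace common { struct Target; }                     // top-level ::common
//   namespace cinn { namespace common { struct Target; } }  // cinn::common
//
//   using namespace cinn;
//   // common::Target* t;               // error: 'common' is ambiguous here
//   cinn::common::Target* t = nullptr;  // OK: the fully qualified spelling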
- Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, K}); Var j(4, "j0"); @@ -2496,7 +2496,7 @@ TEST(IrSchedule, factorize_reduction2) { Expr N(4); Expr K(5); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N * K}); Var j(4 * 5, "j0"); @@ -2582,7 +2582,7 @@ TEST(IrSchedule, compute_inline1) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -2653,7 +2653,7 @@ TEST(IrSchedule, compute_inline2) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -2728,7 +2728,7 @@ TEST(IrSchedule, compute_inline3) { Expr N(32); Expr P(32); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -2790,7 +2790,7 @@ TEST(IrSchedule, compute_inline4) { Expr N(32); Expr P(32); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -2852,7 +2852,7 @@ TEST(IrSchedule, reverse_compute_inline1) { Expr M(32); Expr N(64); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N}); auto B = Compute( @@ -2915,7 +2915,7 @@ TEST(IrSchedule, reverse_compute_inline2) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -2984,7 +2984,7 @@ TEST(IrSchedule, copytransform1) { Expr N(32); Expr P(32); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -3075,7 +3075,7 @@ TEST(IrSchedule, copytransform2) { Expr N(64); Expr P(128); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Placeholder A("A", {M, N, P}); auto B = Compute( @@ -3171,7 +3171,7 @@ TEST(IrSchedule, Annotate) { {}, {}, nullptr, - common::DefaultHostTarget(), + cinn::common::DefaultHostTarget(), true); ir::IRSchedule ir_sch(ir::ModuleExpr({funcs[0]->body})); auto fused = ir_sch.Fuse("B", {0, 1}); @@ -3215,7 +3215,7 @@ TEST(IrSchedule, Unannotate) { {}, {}, nullptr, - common::DefaultHostTarget(), + cinn::common::DefaultHostTarget(), true); ir::IRSchedule ir_sch(ir::ModuleExpr({funcs[0]->body})); auto fused = ir_sch.Fuse("B", {0, 1}); @@ -3253,7 +3253,7 @@ TEST(IrSchedule, Unannotate) { } TEST(IrSchedule, ComplexIndices) { - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); ir::Expr M(32); ir::Expr K(64); @@ -3375,7 +3375,7 @@ TEST(IrSchedule, SamplePerfectTile) { {}, {}, nullptr, - common::DefaultHostTarget(), + cinn::common::DefaultHostTarget(), true); ir::IRSchedule ir_sch(ir::ModuleExpr({funcs[0]->body})); @@ -3400,7 +3400,7 @@ TEST(IrSchedule, GetChildBlocks) { {}, {}, nullptr, - common::DefaultHostTarget(), + cinn::common::DefaultHostTarget(), true); ir::IRSchedule ir_sch(ir::ModuleExpr({funcs[0]->body})); @@ -3440,7 +3440,7 @@ TEST(IrSchedule, SampleCategorical) { {}, {}, nullptr, - common::DefaultHostTarget(), + cinn::common::DefaultHostTarget(), true); ir::IRSchedule 
ir_sch(ir::ModuleExpr({funcs[0]->body})); diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc index ff90a14fcfd204..a79e67fd6c4839 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm.cc +++ b/paddle/cinn/backends/llvm/codegen_llvm.cc @@ -55,8 +55,8 @@ namespace cinn { namespace backends { using BinaryInstruction = llvm::Instruction::BinaryOps; -using common::bfloat16; -using common::float16; +using cinn::common::bfloat16; +using cinn::common::float16; namespace { @@ -69,9 +69,11 @@ auto NodeToExpr(const T *node) { return oss.str(); } -bool is_integral_type(common::Type t) { return t.is_int() || t.is_uint(); } +bool is_integral_type(cinn::common::Type t) { + return t.is_int() || t.is_uint(); +} -bool is_floating_type(common::Type t) { return t.is_float(); } +bool is_floating_type(cinn::common::Type t) { return t.is_float(); } llvm::Value *EmitComparison(llvm::CmpInst::Predicate predicate, llvm::Value *lhs, @@ -405,7 +407,8 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Cast *op) { // pod_value_t cast to a value. if (op->v().type().is_customized_type() && op->v().type().customized_type() == - common::customized_type::kpod_value_t) { // pod_value_t operator + cinn::common::customized_type::kpod_value_t) { // pod_value_t + // operator llvm::Function *callee{}; if (op->type().is_bool()) { callee = m_->getFunction(runtime::intrinsic::pod_value_to_bool); @@ -970,7 +973,7 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Store *op) { // fit the total_lanes in native_lanes(split into multiple native steps) for (int offset = 0; offset < total_lanes; offset += total_lanes) { int lanes = total_lanes; - Expr base = common::AutoSimplify(ramp->base + offset); + Expr base = cinn::common::AutoSimplify(ramp->base + offset); optim::VarModSimplify(&base); auto *ptr = CreateBufferPtr(op->type().ElementOf(), buffer, Visit(&base)); @@ -1283,7 +1286,7 @@ llvm::Value *CodeGenLLVM::DenseVectorLoad(const ir::Load *op) { for (int i = 0; i < load_lanes; i += load_lanes) { int slice_lanes = load_lanes; - auto slice_base = common::AutoSimplify(ramp->base + i); + auto slice_base = cinn::common::AutoSimplify(ramp->base + i); optim::VarModSimplify(&slice_base); auto slide_stride = Expr(1); auto slide_index = slice_base; diff --git a/paddle/cinn/backends/llvm/codegen_llvm.h b/paddle/cinn/backends/llvm/codegen_llvm.h index ff885db2c8e594..3428e213b014f8 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm.h +++ b/paddle/cinn/backends/llvm/codegen_llvm.h @@ -118,7 +118,7 @@ class CodeGenLLVM : public LLVMIRVisitor, public IrBuilderMixin { llvm::Module *m, llvm::IRBuilder<> *b, const std::shared_ptr &symbol_table = nullptr, - const Target &target = common::DefaultHostTarget()); + const Target &target = cinn::common::DefaultHostTarget()); // Common llvm types // @{ diff --git a/paddle/cinn/backends/llvm/codegen_llvm_test.cc b/paddle/cinn/backends/llvm/codegen_llvm_test.cc index aa6ca91af1b26b..930e70f22e8692 100644 --- a/paddle/cinn/backends/llvm/codegen_llvm_test.cc +++ b/paddle/cinn/backends/llvm/codegen_llvm_test.cc @@ -59,7 +59,7 @@ auto CreateTensor() { auto c = lang::Compute( {M, N}, [&](auto i, auto j) { return a(i, j) + b(i, j); }, "c"); - lang::Buffer c_buf(common::Float(32)); + lang::Buffer c_buf(cinn::common::Float(32)); return std::make_tuple( std::move(a), std::move(b), std::move(c), std::move(c_buf)); @@ -82,7 +82,7 @@ template -auto CreateBinaryOp(common::Type t, T1 x, T2 y) { +auto CreateBinaryOp(cinn::common::Type t, T1 x, T2 y) { auto px = std::make_unique(t, x); auto py 
= std::make_unique(t, y); @@ -92,7 +92,7 @@ auto CreateBinaryOp(common::Type t, T1 x, T2 y) { return std::make_unique(std::move(ex), std::move(ey)); } -auto CreateIrBuffer(common::Type t, +auto CreateIrBuffer(cinn::common::Type t, std::string name, std::vector shape, int data_alignment = 0) { @@ -104,7 +104,7 @@ auto CreateIrBuffer(common::Type t, } for (auto i : shape) { - auto pi = std::make_unique(common::Int(32), i); + auto pi = std::make_unique(cinn::common::Int(32), i); buffer->shape.emplace_back(pi.release()); } @@ -114,7 +114,7 @@ auto CreateIrBuffer(common::Type t, auto CreateIrTensor(std::string name, std::vector shape) { std::vector shape_expr; for (auto i : shape) { - auto pi = std::make_unique(common::Int(32), i); + auto pi = std::make_unique(cinn::common::Int(32), i); shape_expr.emplace_back(pi.release()); } @@ -146,28 +146,28 @@ TEST(CodeGenLLVM, Imm) { llvm::Value *value = nullptr; - ir::IntImm i32_imm(common::Int(32), 10); + ir::IntImm i32_imm(cinn::common::Int(32), 10); value = emitter->Visit(&i32_imm); ASSERT_EQ(value->getType(), i32); ASSERT_EQ(value, llvm::ConstantInt::get(i32, i32_imm.value, true)); // value->print(llvm::outs(), false); - ir::UIntImm u32_imm(common::UInt(32), 5); + ir::UIntImm u32_imm(cinn::common::UInt(32), 5); value = emitter->Visit(&u32_imm); ASSERT_EQ(value->getType(), u32); ASSERT_EQ(value, llvm::ConstantInt::get(u32, u32_imm.value, false)); - ir::FloatImm float32_imm(common::Float(32), 2.5); + ir::FloatImm float32_imm(cinn::common::Float(32), 2.5); value = emitter->Visit(&float32_imm); ASSERT_EQ(value->getType(), f32); ASSERT_EQ(value, llvm::ConstantFP::get(f32, float32_imm.value)); - ir::FloatImm float16_imm(common::Float16(), 2.5); + ir::FloatImm float16_imm(cinn::common::Float16(), 2.5); value = emitter->Visit(&float16_imm); ASSERT_EQ(value->getType(), f16); ASSERT_EQ(value, llvm::ConstantFP::get(f16, float16_imm.value)); - ir::FloatImm bfloat16_imm(common::BFloat16(), 2.5); + ir::FloatImm bfloat16_imm(cinn::common::BFloat16(), 2.5); value = emitter->Visit(&bfloat16_imm); ASSERT_EQ(value->getType(), bf16); ASSERT_EQ(value, llvm::ConstantFP::get(bf16, bfloat16_imm.value)); @@ -198,7 +198,8 @@ TEST(CodeGenLLVM, Expr) { do { int x = 2; int y = 3; - auto op = CreateBinaryOp(common::Int(32), x, y); + auto op = + CreateBinaryOp(cinn::common::Int(32), x, y); expect_value = llvm::ConstantInt::get(i32, x + y); value = emitter->Visit(op.get()); @@ -213,8 +214,8 @@ TEST(CodeGenLLVM, Expr) { do { float x = 2.5; float y = 3.5; - auto op = - CreateBinaryOp(common::Float(32), x, y); + auto op = CreateBinaryOp( + cinn::common::Float(32), x, y); expect_value = llvm::ConstantFP::get(f32, x - y); value = emitter->Visit(op.get()); @@ -226,8 +227,8 @@ TEST(CodeGenLLVM, Expr) { do { float16 x{2.5}; float16 y{3.5}; - auto op = - CreateBinaryOp(common::Float16(), x, y); + auto op = CreateBinaryOp( + cinn::common::Float16(), x, y); expect_value = llvm::ConstantFP::get(f16, x - y); value = emitter->Visit(op.get()); @@ -240,7 +241,7 @@ TEST(CodeGenLLVM, Expr) { bfloat16 x{2.5}; bfloat16 y{3.5}; auto op = CreateBinaryOp( - common::BFloat16(), x, y); + cinn::common::BFloat16(), x, y); expect_value = llvm::ConstantFP::get(bf16, x - y); value = emitter->Visit(op.get()); @@ -252,7 +253,8 @@ TEST(CodeGenLLVM, Expr) { do { int x = 5; int y = 3; - auto op = CreateBinaryOp(common::Int(64), x, y); + auto op = + CreateBinaryOp(cinn::common::Int(64), x, y); expect_value = llvm::ConstantInt::get(i64, x * y); value = emitter->Visit(op.get()); ASSERT_EQ(value->getType(), i64); @@ 
-263,8 +265,8 @@ TEST(CodeGenLLVM, Expr) { do { float x = 6; float y = 4; - auto op = - CreateBinaryOp(common::Float(32), x, y); + auto op = CreateBinaryOp( + cinn::common::Float(32), x, y); expect_value = llvm::ConstantFP::get(f32, x / y); value = emitter->Visit(op.get()); ASSERT_EQ(value->getType(), f32); @@ -275,8 +277,8 @@ TEST(CodeGenLLVM, Expr) { do { float16 x{6}; float16 y{4}; - auto op = - CreateBinaryOp(common::Float16(), x, y); + auto op = CreateBinaryOp( + cinn::common::Float16(), x, y); expect_value = llvm::ConstantFP::get(f16, x / y); value = emitter->Visit(op.get()); ASSERT_EQ(value->getType(), f16); @@ -288,7 +290,7 @@ TEST(CodeGenLLVM, Expr) { bfloat16 x{6}; bfloat16 y{4}; auto op = CreateBinaryOp( - common::BFloat16(), x, y); + cinn::common::BFloat16(), x, y); expect_value = llvm::ConstantFP::get(bf16, x / y); value = emitter->Visit(op.get()); ASSERT_EQ(value->getType(), bf16); @@ -299,7 +301,8 @@ TEST(CodeGenLLVM, Expr) { do { int x = 25; int y = 7; - auto op = CreateBinaryOp(common::Int(32), x, y); + auto op = + CreateBinaryOp(cinn::common::Int(32), x, y); expect_value = llvm::ConstantInt::get(i32, x % y); value = emitter->Visit(op.get()); ASSERT_EQ(value->getType(), i32); @@ -310,7 +313,8 @@ TEST(CodeGenLLVM, Expr) { do { int x = 3; int y = 3; - auto op = CreateBinaryOp(common::Int(32), x, y); + auto op = + CreateBinaryOp(cinn::common::Int(32), x, y); expect_value = llvm::ConstantInt::get(i1, 1); value = emitter->Visit(op.get()); ASSERT_EQ(value->getType(), i1); @@ -322,8 +326,8 @@ TEST(CodeGenLLVM, Expr) { float x = 3; float y = 3; - auto op = - CreateBinaryOp(common::Float(32), x, y); + auto op = CreateBinaryOp( + cinn::common::Float(32), x, y); expect_value = llvm::ConstantInt::get(i1, 0); value = emitter->Visit(op.get()); ASSERT_EQ(value->getType(), i1); @@ -334,7 +338,8 @@ TEST(CodeGenLLVM, Expr) { do { int x = 6; int y = 6; - auto op = CreateBinaryOp(common::Int(32), x, y); + auto op = + CreateBinaryOp(cinn::common::Int(32), x, y); value = emitter->Visit(op.get()); expect_value = llvm::ConstantInt::get(i1, 0); ASSERT_EQ(value->getType(), i1); @@ -345,7 +350,8 @@ TEST(CodeGenLLVM, Expr) { do { int x = 6; int y = 6; - auto op = CreateBinaryOp(common::Int(32), x, y); + auto op = + CreateBinaryOp(cinn::common::Int(32), x, y); value = emitter->Visit(op.get()); expect_value = llvm::ConstantInt::get(i1, 1); ASSERT_EQ(value->getType(), i1); @@ -356,7 +362,8 @@ TEST(CodeGenLLVM, Expr) { do { int x = 6; int y = 6; - auto op = CreateBinaryOp(common::Int(32), x, y); + auto op = + CreateBinaryOp(cinn::common::Int(32), x, y); value = emitter->Visit(op.get()); expect_value = llvm::ConstantInt::get(i1, 0); ASSERT_EQ(value->getType(), i1); @@ -367,7 +374,8 @@ TEST(CodeGenLLVM, Expr) { do { int x = 6; int y = 6; - auto op = CreateBinaryOp(common::Int(32), x, y); + auto op = + CreateBinaryOp(cinn::common::Int(32), x, y); value = emitter->Visit(op.get()); expect_value = llvm::ConstantInt::get(i1, 1); ASSERT_EQ(value->getType(), i1); @@ -382,7 +390,8 @@ TEST(CodeGenLLVM, Expr) { do { int x = 2; int y = 3; - auto op = CreateBinaryOp(common::Int(32), x, y); + auto op = + CreateBinaryOp(cinn::common::Int(32), x, y); value = emitter->Visit(op.get()); expect_value = llvm::ConstantInt::get(i32, std::min(x, y)); ASSERT_EQ(value->getType(), i32); @@ -393,8 +402,8 @@ TEST(CodeGenLLVM, Expr) { do { float x = 2; float y = 3; - auto op = - CreateBinaryOp(common::Float(32), x, y); + auto op = CreateBinaryOp( + cinn::common::Float(32), x, y); value = emitter->Visit(op.get()); expect_value = 
llvm::ConstantFP::get(f32, std::max(x, y)); ASSERT_EQ(value->getType(), f32); @@ -412,9 +421,9 @@ TEST(CodeGenLLVM, Expr) { // i32 -> f32 LOG(INFO) << "test i32 -> f32"; int v2 = 2; - auto x2 = std::make_unique(common::Int(32), v2); + auto x2 = std::make_unique(cinn::common::Int(32), v2); auto ex2 = ir::Expr(x2.release()); - auto op2 = ir::Cast::Make(common::Float(32), std::move(ex2)); + auto op2 = ir::Cast::Make(cinn::common::Float(32), std::move(ex2)); value = emitter->Visit(&op2); expect_value = llvm::ConstantFP::get(f32, v2); ASSERT_EQ(value->getType(), f32); @@ -423,9 +432,9 @@ TEST(CodeGenLLVM, Expr) { // f32 -> i32 LOG(INFO) << "test f32 -> i32"; float v3 = 3; - auto x3 = std::make_unique(common::Float(32), v3); + auto x3 = std::make_unique(cinn::common::Float(32), v3); auto ex3 = ir::Expr(x3.release()); - auto op3 = ir::Cast::Make(common::Int(32), std::move(ex3)); + auto op3 = ir::Cast::Make(cinn::common::Int(32), std::move(ex3)); value = emitter->Visit(&op3); expect_value = llvm::ConstantInt::get(i32, v3); ASSERT_EQ(value->getType(), i32); @@ -434,9 +443,9 @@ TEST(CodeGenLLVM, Expr) { // i32 -> f16 LOG(INFO) << "test i32 -> f16"; int v4 = 4; - auto x4 = std::make_unique(common::Int(32), v4); + auto x4 = std::make_unique(cinn::common::Int(32), v4); auto ex4 = ir::Expr(x4.release()); - auto op4 = ir::Cast::Make(common::Float16(), std::move(ex4)); + auto op4 = ir::Cast::Make(cinn::common::Float16(), std::move(ex4)); value = emitter->Visit(&op4); expect_value = llvm::ConstantFP::get(f16, v4); ASSERT_EQ(value->getType(), f16); @@ -445,9 +454,9 @@ TEST(CodeGenLLVM, Expr) { // f16 -> f32 LOG(INFO) << "test f16 -> f32"; float16 v5{5}; - auto x5 = std::make_unique(common::Float16(), v5); + auto x5 = std::make_unique(cinn::common::Float16(), v5); auto ex5 = ir::Expr(x5.release()); - auto op5 = ir::Cast::Make(common::Float(32), std::move(ex5)); + auto op5 = ir::Cast::Make(cinn::common::Float(32), std::move(ex5)); value = emitter->Visit(&op5); expect_value = llvm::ConstantFP::get(f32, v5); ASSERT_EQ(value->getType(), f32); @@ -456,9 +465,9 @@ TEST(CodeGenLLVM, Expr) { // i32 -> bf16 LOG(INFO) << "test i32 -> bf16"; int v6 = 4; - auto x6 = std::make_unique(common::Int(32), v6); + auto x6 = std::make_unique(cinn::common::Int(32), v6); auto ex6 = ir::Expr(x6.release()); - auto op6 = ir::Cast::Make(common::BFloat16(), std::move(ex6)); + auto op6 = ir::Cast::Make(cinn::common::BFloat16(), std::move(ex6)); value = emitter->Visit(&op6); expect_value = llvm::ConstantFP::get(bf16, v6); ASSERT_EQ(value->getType(), bf16); @@ -467,9 +476,9 @@ TEST(CodeGenLLVM, Expr) { // bf16 -> f32 LOG(INFO) << "test bf16 -> f32"; bfloat16 v7{5}; - auto x7 = std::make_unique(common::BFloat16(), v7); + auto x7 = std::make_unique(cinn::common::BFloat16(), v7); auto ex7 = ir::Expr(x7.release()); - auto op7 = ir::Cast::Make(common::Float(32), std::move(ex7)); + auto op7 = ir::Cast::Make(cinn::common::Float(32), std::move(ex7)); value = emitter->Visit(&op7); expect_value = llvm::ConstantFP::get(f32, v7); ASSERT_EQ(value->getType(), f32); @@ -515,7 +524,7 @@ TEST(CodeGenLLVM, Statement) { // ir::Tensor auto tensor_op = CreateIrTensor("x", {2, 3}); - tensor_op->buffer = CreateIrBuffer(common::Int(32), "", {2, 3}); + tensor_op->buffer = CreateIrBuffer(cinn::common::Int(32), "", {2, 3}); // ir::Alloc auto alloc_op = std::make_unique(); @@ -525,17 +534,19 @@ TEST(CodeGenLLVM, Statement) { auto store_op = std::make_unique(); store_op->tensor = ir::Expr(tensor_op); for (int i : {1, 1}) { - auto pi = 
std::make_unique(common::Int(32), std::move(i)); + auto pi = + std::make_unique(cinn::common::Int(32), std::move(i)); store_op->indices.emplace_back(pi.release()); } - auto store_value = std::make_unique(common::Int(32), 5); + auto store_value = std::make_unique(cinn::common::Int(32), 5); store_op->value = ir::Expr(store_value.release()); // ir::Load auto load_op = std::make_unique(); load_op->tensor = ir::Expr(tensor_op); for (int i : {1, 1}) { - auto pi = std::make_unique(common::Int(32), std::move(i)); + auto pi = + std::make_unique(cinn::common::Int(32), std::move(i)); load_op->indices.emplace_back(pi.release()); } @@ -544,7 +555,7 @@ TEST(CodeGenLLVM, Statement) { free_op->destination = ir::Expr(tensor_op->buffer); // ir::Call - auto call_op = std::make_unique(common::Int(32)); + auto call_op = std::make_unique(cinn::common::Int(32)); call_op->name = "codegen_llvm_test.Alloc_Store_Load_Free"; // Emit llvm ir diff --git a/paddle/cinn/backends/llvm/codegen_x86.cc b/paddle/cinn/backends/llvm/codegen_x86.cc index 9de0603e2c9e26..cfd796162241c0 100644 --- a/paddle/cinn/backends/llvm/codegen_x86.cc +++ b/paddle/cinn/backends/llvm/codegen_x86.cc @@ -128,8 +128,8 @@ void CodeGenX86::CreateParallelLaunch(Expr body, int num_task) { symbol_table_->PushScope(); UnpackVars(vars, data); ParallelEnv par_env; - auto task_id_name = common::UniqName("task_id"); - auto num_task_name = common::UniqName("num_task"); + auto task_id_name = cinn::common::UniqName("task_id"); + auto num_task_name = cinn::common::UniqName("num_task"); par_env.task_id = ir::Var(task_id_name, Int(32)); par_env.num_task = ir::Var(num_task_name, Int(32)); SetVar(task_id_name, task_id); diff --git a/paddle/cinn/backends/llvm/codegen_x86_test.cc b/paddle/cinn/backends/llvm/codegen_x86_test.cc index 42cd0f171435da..16c698fd88bd6e 100644 --- a/paddle/cinn/backends/llvm/codegen_x86_test.cc +++ b/paddle/cinn/backends/llvm/codegen_x86_test.cc @@ -39,7 +39,7 @@ TEST(Vectorize, basic) { LOG(INFO) << "fn: " << fn; - Module::Builder builder("module", common::DefaultHostTarget()); + Module::Builder builder("module", cinn::common::DefaultHostTarget()); builder.AddFunction(fn); auto module = builder.Build(); @@ -53,18 +53,21 @@ TEST(Vectorize, basic) { auto* fn_ptr = reinterpret_cast(fn_); - auto* A_buf = common::BufferBuilder(Float(32), {1024}) + auto* A_buf = cinn::common::BufferBuilder(Float(32), {1024}) .set_random() .set_align(64) .Build(); - auto* B_buf = common::BufferBuilder(Float(32), {1024}) + auto* B_buf = cinn::common::BufferBuilder(Float(32), {1024}) .set_random() .set_align(64) .Build(); - auto* C_buf = - common::BufferBuilder(Float(32), {1024}).set_zero().set_align(64).Build(); + auto* C_buf = cinn::common::BufferBuilder(Float(32), {1024}) + .set_zero() + .set_align(64) + .Build(); - auto args = common::ArgsBuilder().Add(A_buf).Add(B_buf).Add(C_buf).Build(); + auto args = + cinn::common::ArgsBuilder().Add(A_buf).Add(B_buf).Add(C_buf).Build(); fn_ptr(reinterpret_cast(args.data()), args.size()); diff --git a/paddle/cinn/backends/llvm/execution_engine_test.cc b/paddle/cinn/backends/llvm/execution_engine_test.cc index 7adca52f34ca78..a66b63248a50dd 100644 --- a/paddle/cinn/backends/llvm/execution_engine_test.cc +++ b/paddle/cinn/backends/llvm/execution_engine_test.cc @@ -107,10 +107,10 @@ auto CreateTestCinnModule() { {M, N}, [&](Var i, Var j) { return A(i, j) + B(i, j); }, "C"); C->Bind(C_buf); - common::Target target; - target.arch = common::Target::Arch::X86; - target.bits = common::Target::Bit::k32; - target.os = 
common::Target::OS::Linux; + cinn::common::Target target; + target.arch = cinn::common::Target::Arch::X86; + target.bits = cinn::common::Target::Bit::k32; + target.os = cinn::common::Target::OS::Linux; ir::Module::Builder builder("module1", target); auto stages = CreateStages({C}); @@ -154,7 +154,7 @@ TEST(llvm_test01, elementwise_add) { } TEST(llvm, module_call_lowered_func) { - ir::Module::Builder builder("some_module", common::DefaultHostTarget()); + ir::Module::Builder builder("some_module", cinn::common::DefaultHostTarget()); ir::Expr M(kM); ir::Expr N(kN); { // define fn @@ -184,7 +184,7 @@ TEST(llvm, module_call_lowered_func) { auto main_fn = lang::Lower("main", stages, {a, b, c}, {}); builder.AddFunction(main_fn); - CodeGenC codegen(common::DefaultHostTarget()); + CodeGenC codegen(cinn::common::DefaultHostTarget()); codegen.SetInlineBuiltinCodes(false); LOG(INFO) << "module:\n" << codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); @@ -318,7 +318,7 @@ TEST(ExecutionEngine, call_extern) { stages[add_out]->ComputeInline(); auto func = Lower("comp", stages, {x, y, res}); - Module::Builder builder("module0", common::DefaultHostTarget()); + Module::Builder builder("module0", cinn::common::DefaultHostTarget()); builder.AddFunction(func); auto engine = backends::ExecutionEngine::Create({1}); diff --git a/paddle/cinn/backends/llvm/llvm_intrin_rule.h b/paddle/cinn/backends/llvm/llvm_intrin_rule.h index 77d22349ed2580..903c056196f4e4 100644 --- a/paddle/cinn/backends/llvm/llvm_intrin_rule.h +++ b/paddle/cinn/backends/llvm/llvm_intrin_rule.h @@ -104,7 +104,7 @@ void RegisterCpuIntrinRule() { Expr arg = node->read_args[0]; Type type = arg->type(); if (type.is_int() || type.is_uint()) { - *rv = common::make_bool(false, type.lanes()); + *rv = cinn::common::make_bool(false, type.lanes()); } else if (type.is_float()) { *rv = ir::EQ::Make(lang::Abs(arg), lang::Infinity(type)) && !(lang::IsNan(arg)); diff --git a/paddle/cinn/backends/llvm/llvm_util.cc b/paddle/cinn/backends/llvm/llvm_util.cc index 32256ecc5c9ca0..f1c35d7f58e68d 100644 --- a/paddle/cinn/backends/llvm/llvm_util.cc +++ b/paddle/cinn/backends/llvm/llvm_util.cc @@ -26,7 +26,7 @@ namespace backends { using cinn::common::bfloat16; using cinn::common::float16; -llvm::Type *CinnTypeToLLVMType(common::Type type, +llvm::Type *CinnTypeToLLVMType(cinn::common::Type type, llvm::Module *m, bool is_vec) { llvm::Type *ir_type = nullptr; @@ -118,10 +118,10 @@ llvm::Type *CinnTypeToLLVMType(common::Type type, return ir_type; } -#define __(ty__) \ - template <> \ - llvm::Type *llvm_type_of(llvm::Module * m) { \ - return CinnTypeToLLVMType(common::type_of(), m); \ +#define __(ty__) \ + template <> \ + llvm::Type *llvm_type_of(llvm::Module * m) { \ + return CinnTypeToLLVMType(cinn::common::type_of(), m); \ } __(int8_t) diff --git a/paddle/cinn/backends/llvm/llvm_util.h b/paddle/cinn/backends/llvm/llvm_util.h index dd1a79768ab027..de9227c3d94525 100644 --- a/paddle/cinn/backends/llvm/llvm_util.h +++ b/paddle/cinn/backends/llvm/llvm_util.h @@ -48,7 +48,7 @@ inline llvm::StringRef AsStringRef(absl::string_view str) { return llvm::StringRef(str.data(), str.size()); } -llvm::Type *CinnTypeToLLVMType(common::Type t, +llvm::Type *CinnTypeToLLVMType(cinn::common::Type t, llvm::Module *m, bool is_vec = false); diff --git a/paddle/cinn/backends/nvrtc/nvrtc_util.cc b/paddle/cinn/backends/nvrtc/nvrtc_util.cc index b5ef59f6bdae2e..4ad94e506c1986 100644 --- a/paddle/cinn/backends/nvrtc/nvrtc_util.cc +++ b/paddle/cinn/backends/nvrtc/nvrtc_util.cc @@ -176,7 
+176,7 @@ std::string Compiler::CompileWithNvcc(const std::string& cuda_c) { } // get unique prefix name - prefix_name_ = dir + "/" + common::UniqName("rtc_tmp"); + prefix_name_ = dir + "/" + cinn::common::UniqName("rtc_tmp"); auto cuda_c_file = prefix_name_ + ".cu"; std::ofstream ofs(cuda_c_file, std::ios::out); @@ -194,7 +194,7 @@ std::string Compiler::CompileWithNvcc(const std::string& cuda_c) { // std::ios::in); } void Compiler::CompileToPtx() { - auto include_dir = common::Context::Global().runtime_include_dir(); + auto include_dir = cinn::common::Context::Global().runtime_include_dir(); std::string include_dir_str = ""; for (auto dir : include_dir) { if (include_dir_str.empty()) { diff --git a/paddle/cinn/common/arithmatic.cc b/paddle/cinn/common/arithmatic.cc index 1caad647bba69f..5cabe56dff2db5 100644 --- a/paddle/cinn/common/arithmatic.cc +++ b/paddle/cinn/common/arithmatic.cc @@ -125,7 +125,7 @@ GiNaC::ex ExprToGinacConverter::BuildHelper(ir::Expr expr) { } GiNaC::ex ExprToGinacConverter::operator()(Expr expr) { - // TODO(Superjomn) Replace this with common::IsPureMath( + // TODO(Superjomn) Replace this with cinn::common::IsPureMath( auto complex_nodes = ir::ir_utils::CollectIRNodes(expr, [](const Expr* n) { return n->As() || // n->As() || // diff --git a/paddle/cinn/common/axis.cc b/paddle/cinn/common/axis.cc index 0e13c6bd0781a6..9913a38b5ed5cc 100644 --- a/paddle/cinn/common/axis.cc +++ b/paddle/cinn/common/axis.cc @@ -68,7 +68,7 @@ std::string axis_name(int level) { std::vector GenDefaultAxis(int naxis) { std::vector axis; for (int i = 0; i < naxis; i++) { - axis.emplace_back(common::axis_name(i)); + axis.emplace_back(cinn::common::axis_name(i)); CHECK(axis.back()->type().valid()); } return axis; } diff --git a/paddle/cinn/common/cas.cc b/paddle/cinn/common/cas.cc index 9a4d5bc3ed2794..a24549896de506 100644 --- a/paddle/cinn/common/cas.cc +++ b/paddle/cinn/common/cas.cc @@ -2309,14 +2309,14 @@ Expr SolveInequality(Expr inequality, Var val) { #undef __ Expr all = AutoSimplify(a - b); - // if (common::IsPureMath(a) && common::IsPureMath(b)) { + // if (cinn::common::IsPureMath(a) && cinn::common::IsPureMath(b)) { if (true) { - auto _res_positive_ = common::Solve(a, b, val); // NOLINT + auto _res_positive_ = cinn::common::Solve(a, b, val); // NOLINT auto& res = std::get<0>(_res_positive_); auto& positive = std::get<1>(_res_positive_); // Simplify it with CAS to avoid random result from GiNac.
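The hunks above are representative of the whole patch: the only functional change is spelling the relative common:: out as the fully qualified cinn::common::. A minimal sketch of the name clash this guards against, assuming (per this patch's goal of moving DDim etc. into a top-level common namespace) that a second, global ::common is now visible; every name below is illustrative, not a real declaration:

    // Illustrative only: why an unqualified common:: can stop compiling once
    // a top-level ::common namespace (the new home of DDim etc.) exists.
    namespace common {               // hypothetical stand-in for the new
    struct DDim {};                  // top-level common namespace
    }  // namespace common

    namespace cinn {
    namespace common {
    struct Type {};
    }  // namespace common
    }  // namespace cinn

    using namespace cinn;  // a very common idiom in CINN .cc and test files

    int main() {
      // common::Type t{};           // error: reference to 'common' is
      //                             // ambiguous (::common vs cinn::common)
      cinn::common::Type t{};        // the fully qualified form this patch uses
      (void)t;
      return 0;
    }

Inside namespace cinn itself the nearer cinn::common would still win unqualified lookup, so the rewrite mainly hardens files and tests that open the namespace with a using-directive, as many of the test files touched here do.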
res = AutoSimplify(res); - res = common::cast(res, val->type()); + res = cinn::common::cast(res, val->type()); if (le_n) { if (positive) return ir::LE::Make(val, res); diff --git a/paddle/cinn/common/cas_test.cc b/paddle/cinn/common/cas_test.cc index e260d91844d763..d37bd87c23685f 100644 --- a/paddle/cinn/common/cas_test.cc +++ b/paddle/cinn/common/cas_test.cc @@ -26,7 +26,7 @@ namespace cinn { namespace common { -using common::make_const; +using cinn::common::make_const; using utils::GetStreamCnt; using utils::Join; using utils::Trim; @@ -181,8 +181,8 @@ TEST(CAS, FracOp) { auto u4 = AutoSimplify(Expr(32768) * (((Expr(32) * x) + y) / 32)); EXPECT_EQ(GetStreamCnt(u4), "((32768 * (y / 32)) + (32768 * x))"); - common::cas_intervals_t var_intervals; - var_intervals.emplace("y", common::CasInterval(0, 31)); + cinn::common::cas_intervals_t var_intervals; + var_intervals.emplace("y", cinn::common::CasInterval(0, 31)); auto u = AutoSimplify((Expr(x) * 32 + y) / 32, var_intervals); EXPECT_EQ(GetStreamCnt(u), "x"); diff --git a/paddle/cinn/common/cinn_value.h b/paddle/cinn/common/cinn_value.h index 587a79ec71c6ff..3cfb4214d76b9a 100755 --- a/paddle/cinn/common/cinn_value.h +++ b/paddle/cinn/common/cinn_value.h @@ -50,7 +50,7 @@ class CINNValuePack; /** * A _CINNValuePack_ is a shared Array of multiple CINNValue. */ -struct _CINNValuePack_ : public common::Object { +struct _CINNValuePack_ : public cinn::common::Object { /** * Create a new CINNValuePack instance. * @param array The list of CINNValues. diff --git a/paddle/cinn/common/cinn_value_test.cc b/paddle/cinn/common/cinn_value_test.cc index 3419ba7849c09b..2e3e30183f61de 100644 --- a/paddle/cinn/common/cinn_value_test.cc +++ b/paddle/cinn/common/cinn_value_test.cc @@ -51,7 +51,7 @@ TEST(CINNValue, Expr) { { CINNValue copied = CINNValue(a); - ASSERT_TRUE(copied == common::make_const(1)); + ASSERT_TRUE(copied == cinn::common::make_const(1)); } } diff --git a/paddle/cinn/common/common.h b/paddle/cinn/common/common.h index e54d8aad4b31d7..34623d904515b3 100644 --- a/paddle/cinn/common/common.h +++ b/paddle/cinn/common/common.h @@ -28,25 +28,25 @@ namespace cinn { // export some general concepts. -using common::Context; -using common::make_shared; -using common::Object; -using common::ref_count; -using common::Shared; -using common::UniqName; +using cinn::common::Context; +using cinn::common::make_shared; +using cinn::common::Object; +using cinn::common::ref_count; +using cinn::common::Shared; +using cinn::common::UniqName; // Type related. 
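The common.h hunk above keeps CINN's existing trick of re-exporting nested names at the project root, only with fully qualified right-hand sides. A rough sketch of that re-export pattern, with abbreviated stand-in names rather than the real header:

    namespace cinn {
    namespace common {
    struct Target {};
    inline Target DefaultHostTarget() { return Target{}; }
    }  // namespace common

    // Re-export so user code can write cinn::Target / cinn::DefaultHostTarget.
    // Qualifying the right-hand side keeps these declarations unambiguous even
    // when a different top-level ::common namespace is also in scope.
    using cinn::common::DefaultHostTarget;
    using cinn::common::Target;
    }  // namespace cinn

    int main() {
      cinn::Target t = cinn::DefaultHostTarget();  // resolved via the re-export
      (void)t;
      return 0;
    }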
-using common::Bool; -using common::Float; -using common::Int; -using common::UInt; -using common::Void; +using cinn::common::Bool; +using cinn::common::Float; +using cinn::common::Int; +using cinn::common::UInt; +using cinn::common::Void; -using common::type_of; +using cinn::common::type_of; -using common::Target; -using common::Type; -using common::UnkTarget; +using cinn::common::Target; +using cinn::common::Type; +using cinn::common::UnkTarget; template T& Reference(const T* x) { @@ -63,7 +63,7 @@ static void CheckVarNameValid(const absl::string_view name) { name.find('\n') == std::string::npos && // name.find('\r') == std::string::npos) << "Some invalid character found"; - CHECK(!common::IsAxisNameReserved(std::string(name))) + CHECK(!cinn::common::IsAxisNameReserved(std::string(name))) << "The name [" << name << "] is reserved for internal axis"; } diff --git a/paddle/cinn/common/equation_graph_topo_walker.h b/paddle/cinn/common/equation_graph_topo_walker.h index b7fba86f5a0b2e..6098a54100d606 100644 --- a/paddle/cinn/common/equation_graph_topo_walker.h +++ b/paddle/cinn/common/equation_graph_topo_walker.h @@ -110,7 +110,7 @@ class EquationGraphTopoWalker final { for (VarIterT iter = begin; iter != end; ++iter) { VisitNextFunctions(*iter, [&](FT f) { starts.emplace_back(f); }); } - common::BfsWalker bfs_walker{BfsVisitNextFunction}; + cinn::common::BfsWalker bfs_walker{BfsVisitNextFunction}; bfs_walker(starts.begin(), starts.end(), FunctionVisitor); } diff --git a/paddle/cinn/common/graph_utils.cc b/paddle/cinn/common/graph_utils.cc index a2b6861b899b49..d0341b8c5f64b9 100755 --- a/paddle/cinn/common/graph_utils.cc +++ b/paddle/cinn/common/graph_utils.cc @@ -211,7 +211,7 @@ bool GraphEdgeCompare::operator()(const Shared &a, } std::set Graph::CollectNodes( - std::function &&teller) { + std::function &&teller) { std::set res; for (auto *node : nodes()) { if (teller(node)) res.insert(node); diff --git a/paddle/cinn/common/graph_utils.h b/paddle/cinn/common/graph_utils.h index cb144e1c901c76..00c2a93f85e0fb 100644 --- a/paddle/cinn/common/graph_utils.h +++ b/paddle/cinn/common/graph_utils.h @@ -66,8 +66,8 @@ class GraphEdge : public Object { }; struct GraphEdgeCompare { - bool operator()(const common::Shared& a, - const common::Shared& b) const; + bool operator()(const cinn::common::Shared& a, + const cinn::common::Shared& b) const; }; /** @@ -224,11 +224,11 @@ class GraphNode : public Object { //! The input links of the node. //! \note We record the raw pointer rather than the shared pointer to avoid //! cycle reference. - std::set, GraphEdgeCompare> inlinks_; + std::set, GraphEdgeCompare> inlinks_; //! The output links of the node. //! \note We record the raw pointer rather than the shared pointer to avoid //! cycle reference. - std::set, GraphEdgeCompare> outlinks_; + std::set, GraphEdgeCompare> outlinks_; mutable int visited_time_{}; //! used to mark the index of node's input/output tensors @@ -276,7 +276,7 @@ class Graph { //! Collect the nodes match the condition defined by \p teller in the graph. 
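The graph_utils.h hunk above stores in/out links as std::set instances ordered by GraphEdgeCompare rather than by raw pointer value; one usual reason for such a comparator is deterministic, reproducible edge ordering. A self-contained sketch of the idea with hypothetical source_id/sink_id fields (the real comparator is defined in graph_utils.cc):

    #include <memory>
    #include <set>
    #include <tuple>

    struct Edge {
      int source_id;  // hypothetical endpoint ids, for illustration only
      int sink_id;
    };

    // Orders shared edges by their endpoints, so iterating the set is stable
    // across runs instead of following heap addresses.
    struct EdgeCompare {
      bool operator()(const std::shared_ptr<Edge>& a,
                      const std::shared_ptr<Edge>& b) const {
        return std::tie(a->source_id, a->sink_id) <
               std::tie(b->source_id, b->sink_id);
      }
    };

    int main() {
      std::set<std::shared_ptr<Edge>, EdgeCompare> outlinks;
      outlinks.insert(std::make_shared<Edge>(Edge{0, 1}));
      outlinks.insert(std::make_shared<Edge>(Edge{0, 1}));  // equivalent key
      return outlinks.size() == 1 ? 0 : 1;  // the duplicate is rejected
    }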
std::set CollectNodes( - std::function&& teller); + std::function&& teller); void DropNode(GraphNode* n) { auto it = std::find_if( @@ -291,7 +291,7 @@ class Graph { void ClearUnlinkedNodes( absl::flat_hash_map>* shape_dict, - absl::flat_hash_map* type_dict, + absl::flat_hash_map* type_dict, absl::flat_hash_map* layout_dict); size_t num_nodes() const { return nodes_.size(); } diff --git a/paddle/cinn/common/ir_util.cc b/paddle/cinn/common/ir_util.cc index d88289a8722b8a..774d7514e6fb23 100644 --- a/paddle/cinn/common/ir_util.cc +++ b/paddle/cinn/common/ir_util.cc @@ -69,8 +69,8 @@ Expr RampRelatedAdd(ir::Ramp *ramp, ir::Ramp *other) { CHECK(ramp); CHECK(other); if (ramp->lanes == other->lanes) { - Expr base_add = common::AutoSimplify(ramp->base + other->base); - Expr stride_add = common::AutoSimplify(ramp->stride + other->stride); + Expr base_add = cinn::common::AutoSimplify(ramp->base + other->base); + Expr stride_add = cinn::common::AutoSimplify(ramp->stride + other->stride); VLOG(2) << base_add; VLOG(2) << stride_add; return ir::Ramp::Make(base_add, stride_add, ramp->lanes); @@ -156,7 +156,7 @@ Expr IndiceToAbsOffset(const std::vector &shape, res = indice_prod; } } - return common::AutoSimplify(res); + return cinn::common::AutoSimplify(res); } Expr IndiceToAbsOffset(const std::vector &shape, diff --git a/paddle/cinn/common/ir_util.h b/paddle/cinn/common/ir_util.h index 179c5dfd0d1245..3f5831e9b858cb 100644 --- a/paddle/cinn/common/ir_util.h +++ b/paddle/cinn/common/ir_util.h @@ -69,10 +69,10 @@ inline Expr make_one() { return make_const(static_cast(1)); } inline Expr make_bool(bool x) { - return common::make_shared(Bool(), x); + return cinn::common::make_shared(Bool(), x); } inline Expr make_bool(bool x, int lanes) { - return common::make_shared(Bool(lanes), x); + return cinn::common::make_shared(Bool(lanes), x); } // @} diff --git a/paddle/cinn/common/make_subgraph_walker.h b/paddle/cinn/common/make_subgraph_walker.h index 0f90c3e6b25cda..d712deb7dda1a4 100644 --- a/paddle/cinn/common/make_subgraph_walker.h +++ b/paddle/cinn/common/make_subgraph_walker.h @@ -26,19 +26,19 @@ namespace cinn::common { template -common::TopoWalker MakeSubgraphWalker( - const common::TopoWalker& walker, +cinn::common::TopoWalker MakeSubgraphWalker( + const cinn::common::TopoWalker& walker, IterT src_begin, IterT src_end, IterT sink_begin, IterT sink_end) { - common::TopoWalker reversed_walker(walker.VisitNextNodes, - walker.VisitPrevNodes); + cinn::common::TopoWalker reversed_walker(walker.VisitNextNodes, + walker.VisitPrevNodes); auto ReachableToOneSrc = - common::MakeIsReachableFromSrcPredicator( + cinn::common::MakeIsReachableFromSrcPredicator( walker, src_begin, src_end); auto ReachableToOneSink = - common::MakeIsReachableFromSrcPredicator( + cinn::common::MakeIsReachableFromSrcPredicator( reversed_walker, sink_begin, sink_end); auto VisitPrevNodes = [ReachableToOneSrc, ReachableToOneSink, walker]( @@ -61,7 +61,7 @@ common::TopoWalker MakeSubgraphWalker( }); }; - return common::TopoWalker(VisitPrevNodes, VisitNextNodes); + return cinn::common::TopoWalker(VisitPrevNodes, VisitNextNodes); } } // namespace cinn::common diff --git a/paddle/cinn/common/union_find.h b/paddle/cinn/common/union_find.h index c42a14683ae3d5..18a2ee2bf69ae8 100644 --- a/paddle/cinn/common/union_find.h +++ b/paddle/cinn/common/union_find.h @@ -94,7 +94,7 @@ struct UnionFind { return res; } - std::vector> nodes; + std::vector> nodes; }; } // namespace common diff --git a/paddle/cinn/frontend/computation_test.cc 
b/paddle/cinn/frontend/computation_test.cc index b3b01af03aba98..e4666fb46e21d7 100644 --- a/paddle/cinn/frontend/computation_test.cc +++ b/paddle/cinn/frontend/computation_test.cc @@ -81,7 +81,7 @@ TEST(cinn_computation, basic_cpu) { auto c = builder.Add(a, b); auto d = builder.Add(a, c); - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); auto comp = CinnComputation::BuildAndCompile(target, builder); std::vector hostA(M * N); std::vector hostB(M * N); @@ -119,7 +119,7 @@ TEST(cinn_computation, basic_gpu) { auto c = builder.Add(a, b); auto d = builder.Add(a, c); - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto comp = CinnComputation::BuildAndCompile(target, builder); std::vector hostA(M * N); std::vector hostB(M * N); @@ -149,7 +149,7 @@ TEST(cinn_computation, basic_gpu) { TEST(cinn_computation, net_builder_cpu) { auto program = CreateTestProgram(); - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); auto compute = CinnComputation::Compile(target, program); auto inputs = compute->GetInputTensors(); ASSERT_EQ(inputs.size(), 2); @@ -183,7 +183,7 @@ TEST(cinn_computation, net_builder_cpu) { #ifdef CINN_WITH_CUDA TEST(cinn_computation, net_builder_gpu) { auto program = CreateTestProgram(); - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto compute = CinnComputation::Compile(target, program); auto inputs = compute->GetInputTensors(); ASSERT_EQ(inputs.size(), 2); @@ -223,7 +223,7 @@ TEST(cinn_computation, net_builder_gpu) { #endif TEST(cinn_computation, fc_execute_cpu) { - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); ASSERT_NE(FLAGS_model_dir, ""); auto compute = CinnComputation::CompilePaddleModel( target, FLAGS_model_dir, {"A"}, {{1, 30}}, false); @@ -240,7 +240,7 @@ TEST(cinn_computation, fc_execute_cpu) { #ifdef CINN_WITH_CUDA TEST(cinn_computation, fc_execute_gpu) { - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); ASSERT_NE(FLAGS_model_dir, ""); auto compute = CinnComputation::CompilePaddleModel( target, FLAGS_model_dir, {"A"}, {{1, 30}}, false); @@ -274,7 +274,7 @@ TEST(cinn_computation, decomposer_cpu) { // without decomposer { auto prog = CreateAddProgram(); - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); auto options = CinnComputation::DefaultCompileOptions(); options.use_decomposer = false; auto compute = CinnComputation::Compile(target, prog, options); @@ -284,7 +284,7 @@ TEST(cinn_computation, decomposer_cpu) { // with decomposer { auto prog = CreateAddProgram(); - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); auto options = CinnComputation::DefaultCompileOptions(); options.use_decomposer = true; auto compute = CinnComputation::Compile(target, prog, options); @@ -295,7 +295,7 @@ TEST(cinn_computation, decomposer_cpu) { #ifdef CINN_WITH_CUDA TEST(cinn_computation, gpu_stream) { // this test only shows the API usage - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto prog = CreateAddProgram(); auto options = CinnComputation::DefaultCompileOptions(); @@ -309,7 +309,7 @@ TEST(cinn_computation, gpu_stream) { TEST(cinn_computation, without_instantiate_variables) { // this test only shows the API usage - auto target = common::DefaultHostTarget(); + auto 
target = cinn::common::DefaultHostTarget(); auto prog = CreateAddProgram(); auto options = CinnComputation::DefaultCompileOptions(); options.with_instantiate_variables = false; diff --git a/paddle/cinn/frontend/decomposer/activation.cc b/paddle/cinn/frontend/decomposer/activation.cc index 040d1af9b1b986..bde48429c6d35e 100644 --- a/paddle/cinn/frontend/decomposer/activation.cc +++ b/paddle/cinn/frontend/decomposer/activation.cc @@ -28,8 +28,10 @@ void relu(const Instruction& instr, const DecomposerContext& context) { auto output = instr->outputs[0]; auto* builder = context.builder(); - auto bcast_zero = builder->FillConstant( - x->shape, 0.0f, common::UniqName("zero"), common::Type2Str(x->type)); + auto bcast_zero = builder->FillConstant(x->shape, + 0.0f, + cinn::common::UniqName("zero"), + cinn::common::Type2Str(x->type)); auto out = builder->Max(x, bcast_zero); // map the output of decomposed operator to the original. @@ -46,8 +48,10 @@ void relu_grad(const Instruction& instr, const DecomposerContext& context) { auto dx = instr->outputs[0]; auto* builder = context.builder(); - auto bcast_zero = builder->FillConstant( - out->shape, 0.0f, common::UniqName("zero"), common::Type2Str(out->type)); + auto bcast_zero = builder->FillConstant(out->shape, + 0.0f, + cinn::common::UniqName("zero"), + cinn::common::Type2Str(out->type)); auto condition = builder->GreaterThan(out, bcast_zero); auto res = builder->Select(condition, dout, bcast_zero); @@ -65,12 +69,14 @@ void gelu(const Instruction& instr, const DecomposerContext& context) { auto* builder = context.builder(); // x * (0.5 + 0.5 * erf(sqrtf(0.5) * x)) - auto p_5 = builder->FillConstant( - x->shape, 0.5f, common::UniqName("p_5"), common::Type2Str(x->type)); + auto p_5 = builder->FillConstant(x->shape, + 0.5f, + cinn::common::UniqName("p_5"), + cinn::common::Type2Str(x->type)); auto p_7 = builder->FillConstant(x->shape, std::sqrt(0.5), - common::UniqName("p_7"), - common::Type2Str(x->type)); + cinn::common::UniqName("p_7"), + cinn::common::Type2Str(x->type)); auto erf = builder->Erf(builder->Multiply(x, p_7)); auto cdf = builder->Add(p_5, builder->Multiply(p_5, erf)); auto out = builder->Multiply(x, cdf); diff --git a/paddle/cinn/frontend/decomposer/activation_test.cc b/paddle/cinn/frontend/decomposer/activation_test.cc index a024aa47537548..de8e6047011d27 100644 --- a/paddle/cinn/frontend/decomposer/activation_test.cc +++ b/paddle/cinn/frontend/decomposer/activation_test.cc @@ -77,7 +77,7 @@ TEST(Decomposer, softmax_decomposer) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = diff --git a/paddle/cinn/frontend/decomposer/batch_norm.cc b/paddle/cinn/frontend/decomposer/batch_norm.cc index 19c53bd506b8f9..b2d59053e43dee 100644 --- a/paddle/cinn/frontend/decomposer/batch_norm.cc +++ b/paddle/cinn/frontend/decomposer/batch_norm.cc @@ -80,8 +80,8 @@ struct BatchNormHelper { auto element_count_1d = builder->FillConstant(sum->shape, element_count, - common::UniqName("element_count"), - common::Type2Str(sum->type)); + cinn::common::UniqName("element_count"), + cinn::common::Type2Str(sum->type)); auto mean = builder->Divide(sum, element_count_1d); return mean; } @@ -93,8 +93,8 @@ struct BatchNormHelper { auto element_count_1d = builder->FillConstant(x_square_sum->shape, element_count, - common::UniqName("element_count"), - common::Type2Str(x_square_sum->type)); + cinn::common::UniqName("element_count"), +
cinn::common::Type2Str(x_square_sum->type)); auto x_square_mean = builder->Divide(x_square_sum, element_count_1d); auto variance = builder->Subtract( x_square_mean, builder->Multiply(mean, builder->Identity(mean))); @@ -103,10 +103,11 @@ struct BatchNormHelper { // std_variance_inv = rsqrt(variance + epsilon) Variable StdVarianceInv1d(Variable variance, float epsilon) { - auto epsilon_1d = builder->FillConstant(variance->shape, - epsilon, - common::UniqName("epsilon"), - common::Type2Str(variance->type)); + auto epsilon_1d = + builder->FillConstant(variance->shape, + epsilon, + cinn::common::UniqName("epsilon"), + cinn::common::Type2Str(variance->type)); auto std_variance_inv = builder->Rsqrt(builder->Add(variance, epsilon_1d)); return std_variance_inv; } @@ -117,8 +118,8 @@ struct BatchNormHelper { auto epsilon_4d = builder->FillConstant(variance_4d->shape, epsilon, - common::UniqName("epsilon"), - common::Type2Str(variance_4d->type)); + cinn::common::UniqName("epsilon"), + cinn::common::Type2Str(variance_4d->type)); auto std_variance_inv_4d = builder->Rsqrt(builder->Add(variance_4d, epsilon_4d)); return std_variance_inv_4d; @@ -129,14 +130,16 @@ struct BatchNormHelper { Variable UpdateMeanVariance(Variable moving_value, Variable saved_value, float momentum) { - auto factor_0 = builder->FillConstant(moving_value->shape, - momentum, - common::UniqName("factor_0"), - common::Type2Str(moving_value->type)); - auto factor_1 = builder->FillConstant(saved_value->shape, - 1.0f - momentum, - common::UniqName("factor_1"), - common::Type2Str(saved_value->type)); + auto factor_0 = + builder->FillConstant(moving_value->shape, + momentum, + cinn::common::UniqName("factor_0"), + cinn::common::Type2Str(moving_value->type)); + auto factor_1 = + builder->FillConstant(saved_value->shape, + 1.0f - momentum, + cinn::common::UniqName("factor_1"), + cinn::common::Type2Str(saved_value->type)); auto new_moving_value = builder->Add(builder->Multiply(moving_value, factor_0), builder->Multiply(saved_value, factor_1)); @@ -253,11 +256,11 @@ void batch_norm_grad(const Instruction& instr, // => x_grad = tmp0 * (tmp1 - tmp2 - tmp3) auto scaled_std_variance_inv = builder->Multiply(scale, helper.StdVarianceInv1d(save_variance, epsilon)); - auto element_count_1d = - builder->FillConstant(scaled_std_variance_inv->shape, - helper.element_count, - common::UniqName("element_count_1d"), - common::Type2Str(scaled_std_variance_inv->type)); + auto element_count_1d = builder->FillConstant( + scaled_std_variance_inv->shape, + helper.element_count, + cinn::common::UniqName("element_count_1d"), + cinn::common::Type2Str(scaled_std_variance_inv->type)); auto tmp0 = builder->BroadcastTo( builder->Divide(scaled_std_variance_inv, element_count_1d), x->shape, @@ -266,8 +269,8 @@ void batch_norm_grad(const Instruction& instr, auto element_count_4d = builder->FillConstant(y_grad->shape, helper.element_count, - common::UniqName("element_count_4d"), - common::Type2Str(y_grad->type)); + cinn::common::UniqName("element_count_4d"), + cinn::common::Type2Str(y_grad->type)); auto tmp1 = builder->Multiply(y_grad, element_count_4d); auto tmp2 = builder->BroadcastTo(bias_grad, x->shape, {helper.channel_dim}); @@ -283,8 +286,8 @@ void batch_norm_grad(const Instruction& instr, auto epsilon_1d = builder->FillConstant(save_variance->shape, epsilon, - common::UniqName("epsilon"), - common::Type2Str(save_variance->type)); + cinn::common::UniqName("epsilon"), + cinn::common::Type2Str(save_variance->type)); auto variance_add_eps = builder->Add(save_variance, 
epsilon_1d); auto variance_add_eps_4d = builder->BroadcastTo(variance_add_eps, x->shape, {helper.channel_dim}); diff --git a/paddle/cinn/frontend/decomposer/batch_norm_test.cc b/paddle/cinn/frontend/decomposer/batch_norm_test.cc index 87c6cccd0dea7d..e395ab58b720c0 100644 --- a/paddle/cinn/frontend/decomposer/batch_norm_test.cc +++ b/paddle/cinn/frontend/decomposer/batch_norm_test.cc @@ -189,7 +189,7 @@ TEST(Decomposer, BatchNormTrain) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target, cinn::frontend::DefaultTrainingOptimizeOptions().program_passes, @@ -389,7 +389,7 @@ TEST(Decomposer, BatchNormGrad) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target, cinn::frontend::DefaultTrainingOptimizeOptions().program_passes, diff --git a/paddle/cinn/frontend/decomposer/test_helper.h b/paddle/cinn/frontend/decomposer/test_helper.h index 526cee8182ea31..4a7bb9b2f80918 100644 --- a/paddle/cinn/frontend/decomposer/test_helper.h +++ b/paddle/cinn/frontend/decomposer/test_helper.h @@ -85,7 +85,7 @@ void CopyFromVector(const std::vector& vec, size_t numel = tensor->shape().numel(); CHECK_EQ(vec.size(), numel); - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA cudaMemcpy(data, vec.data(), numel * sizeof(T), cudaMemcpyHostToDevice); #else @@ -204,7 +204,7 @@ void RunAndCheckShape(NetBuilder* builder, T high = 1, const std::vector& passes = {"Decomposer"}) { auto prog = builder->Build(); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); RunDecomposer(&prog, target, passes, output_names); auto graph = std::make_shared(prog, target); hlir::framework::ApplyPasses(graph.get(), DefaultOpFusionPasses()); diff --git a/paddle/cinn/frontend/decomposer/top_k_test.cc b/paddle/cinn/frontend/decomposer/top_k_test.cc index a01cbcec0f6934..5dc70e36921d4a 100644 --- a/paddle/cinn/frontend/decomposer/top_k_test.cc +++ b/paddle/cinn/frontend/decomposer/top_k_test.cc @@ -29,7 +29,7 @@ TEST(Decomposer, top_k_decomposer) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = diff --git a/paddle/cinn/frontend/decomposer_registry.h b/paddle/cinn/frontend/decomposer_registry.h index 258c81e3350083..3dc142468a9e5e 100644 --- a/paddle/cinn/frontend/decomposer_registry.h +++ b/paddle/cinn/frontend/decomposer_registry.h @@ -67,7 +67,7 @@ class InstrDecomposerRegistry : public Registry { } inline const Decomposer* Get(const std::string& op_name, - const common::Target& target) { + const cinn::common::Target& target) { const Decomposer* decomposer = Find(op_name, target); CHECK(decomposer) << "Decomposer for [" << op_name << ", " << target << "] is not registered"; @@ -75,7 +75,7 @@ class InstrDecomposerRegistry : public Registry { } inline const Decomposer* Find(const std::string& name, - const common::Target& target) { + const cinn::common::Target& target) { return Registry::Find(name + "_" + target.arch_str()); } diff --git a/paddle/cinn/frontend/decomposer_registry_test.cc b/paddle/cinn/frontend/decomposer_registry_test.cc index ad3828706b1a6f..125b6cce97c646 100644 --- a/paddle/cinn/frontend/decomposer_registry_test.cc +++ b/paddle/cinn/frontend/decomposer_registry_test.cc @@ -21,7 +21,7 @@ 
namespace cinn::frontend { TEST(InstrDecomposerRegistry, basic) { - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); ASSERT_EQ(InstrDecomposerRegistry::Global()->Find("conv", target), nullptr); ASSERT_NE(InstrDecomposerRegistry::Global()->Find("relu", target), nullptr); } diff --git a/paddle/cinn/frontend/interpreter_test.cc b/paddle/cinn/frontend/interpreter_test.cc old mode 100755 new mode 100644 index ab3f211120b94b..240a82a646081c --- a/paddle/cinn/frontend/interpreter_test.cc +++ b/paddle/cinn/frontend/interpreter_test.cc @@ -24,7 +24,8 @@ namespace cinn::frontend { TEST(Interpreter, basic) { Interpreter executor({"A"}, {{1, 30}}); - executor.LoadPaddleModel(FLAGS_model_dir, common::DefaultTarget(), true); + executor.LoadPaddleModel( + FLAGS_model_dir, cinn::common::DefaultTarget(), true); executor.Run(); // fc_0.tmp_2 is eliminated by OpFusion, so here // change to get tensor of the out variable diff --git a/paddle/cinn/frontend/net_builder.cc b/paddle/cinn/frontend/net_builder.cc index c7abf882e413d1..b9f6135bdd5b5e 100644 --- a/paddle/cinn/frontend/net_builder.cc +++ b/paddle/cinn/frontend/net_builder.cc @@ -28,8 +28,8 @@ namespace cinn { namespace frontend { -using common::Context; -using common::Type; +using cinn::common::Context; +using cinn::common::Type; using hlir::framework::Operator; using utils::AttributeMap; using utils::ShapeType; @@ -275,7 +275,7 @@ Variable NetBuilder::FillConstant(const std::vector& shape, const std::string& name, const std::string& dtype, bool force_cpu) { - const auto& type = common::Str2Type(dtype); + const auto& type = cinn::common::Str2Type(dtype); utils::Attribute value; if (type.is_float()) { @@ -533,7 +533,7 @@ Variable NetBuilder::Cast(const Variable& operand, const std::string& dtype) { Variable NetBuilder::BitcastConvert(const Variable& operand, const std::string& dtype) { - std::string input_data_type = common::Type2Str(operand->type); + std::string input_data_type = cinn::common::Type2Str(operand->type); return CustomInstr("bitcast_convert", {operand}, {{"dtype", dtype}, {"input_data_type", input_data_type}}) @@ -1125,7 +1125,8 @@ Variable NetBuilder::Cholesky(const Variable& x, bool upper) { : LessEqual(index_row, index_col); auto mask_mat = Reshape(mask, {m, m}); auto mask_full = BroadcastTo(mask_mat, x->shape); - auto zeros = FillConstant(x->shape, 0.0f, "zeros", common::Type2Str(x->type)); + auto zeros = + FillConstant(x->shape, 0.0f, "zeros", cinn::common::Type2Str(x->type)); auto out = Select(mask_full, cholesky_out, zeros); return out; } diff --git a/paddle/cinn/frontend/net_builder.h b/paddle/cinn/frontend/net_builder.h index 17f33e8c2ac092..dde45e2d42fdaf 100644 --- a/paddle/cinn/frontend/net_builder.h +++ b/paddle/cinn/frontend/net_builder.h @@ -396,7 +396,7 @@ class NetBuilder { * @param id_hint The input variable's name. Default is None. * @return The new input. */ - Placeholder CreateInput(const common::Type& type, + Placeholder CreateInput(const cinn::common::Type& type, const cinn::utils::ShapeType& shape, const std::string& id_hint = ""); @@ -411,8 +411,9 @@ class NetBuilder { const T& value, const std::string& name = "", const std::string& dtype = "") { - auto true_dtype = - dtype.empty() ? common::Type2Str(common::type_of()) : dtype; + auto true_dtype = dtype.empty() + ?
cinn::common::Type2Str(cinn::common::type_of()) + : dtype; auto out = CustomInstr( "const_scalar", {}, {{"value", value}, {"dtype", true_dtype}}) @@ -441,7 +442,8 @@ class NetBuilder { using TYPE = typename decltype(all_datas)::value_type; auto true_dtype = - dtype.empty() ? common::Type2Str(common::type_of()) : dtype; + dtype.empty() ? cinn::common::Type2Str(cinn::common::type_of()) + : dtype; const auto& real_shape = GetVectorShape(value); @@ -526,8 +528,11 @@ class NetBuilder { T value, const std::string& name = "", bool force_cpu = false) { - return FillConstant( - shape, value, name, common::Type2Str(common::type_of()), force_cpu); + return FillConstant(shape, + value, + name, + cinn::common::Type2Str(cinn::common::type_of()), + force_cpu); } /** diff --git a/paddle/cinn/frontend/net_builder_test.cc b/paddle/cinn/frontend/net_builder_test.cc index bede6049dbe38e..46839aa890ec5c 100644 --- a/paddle/cinn/frontend/net_builder_test.cc +++ b/paddle/cinn/frontend/net_builder_test.cc @@ -90,9 +90,9 @@ TEST(net_build, TestTransValidVarName) { TEST(net_build, program_execute_multi_elementwise_add) { auto program = CreateAddProgram(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; @@ -131,9 +131,9 @@ TEST(net_build, program_execute_fc) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; @@ -176,9 +176,9 @@ TEST(net_build, program_execute_multi_elementwise_add_bf16) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; @@ -218,9 +218,9 @@ TEST(net_build, program_execute_fc_bf16) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; @@ -282,9 +282,9 @@ TEST(net_build, program_execute_pool2d) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; @@ -314,9 +314,9 @@ TEST(net_build, program_execute_reverse) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; @@ -348,9 +348,9 @@ TEST(net_build, program_execute_gather) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = 
cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -408,9 +408,9 @@ TEST(net_build, program_execute_gather_nd) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; @@ -470,9 +470,9 @@ TEST(net_build, program_execute_cast) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -525,9 +525,9 @@ TEST(net_build, program_execute_squeeze_case0) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -585,9 +585,9 @@ TEST(net_build, program_execute_squeeze_case1) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -645,9 +645,9 @@ TEST(net_build, program_execute_squeeze_case2) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -704,9 +704,9 @@ TEST(net_build, program_execute_squeeze_case3) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -763,9 +763,9 @@ TEST(net_build, program_execute_squeeze_case4) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -820,9 +820,9 @@ TEST(net_build, program_execute_argsort) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -882,9 +882,9 @@ TEST(net_build, program_execute_sort) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = 
common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -943,9 +943,9 @@ TEST(net_build, program_execute_arange_float) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -985,9 +985,9 @@ TEST(net_build, program_execute_arange_int) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -1029,9 +1029,9 @@ TEST(net_build, program_argmax_case1) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -1107,7 +1107,7 @@ TEST(net_build, program_argmax_case2) { Variable output = builder.Argmax(input, 1, false); auto program = builder.Build(); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -1182,9 +1182,9 @@ TEST(net_build, program_argmin_case1) { Variable output = builder.Argmin(input, 1, true); auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; @@ -1261,9 +1261,9 @@ TEST(net_build, program_argmin_case2) { Variable output = builder.Argmin(input, 1, false); auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -1339,9 +1339,9 @@ TEST(net_build, program_execute_repeat_axis_0) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); @@ -1395,9 +1395,9 @@ TEST(net_build, program_execute_repeat_axis_1) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, 
fetch_ids, target); @@ -1457,9 +1457,9 @@ TEST(net_build, program_execute_one_hot) { auto program = builder.Build(); #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = Optimize(&program, fetch_ids, target); diff --git a/paddle/cinn/frontend/op_mapper_registry.h b/paddle/cinn/frontend/op_mapper_registry.h index 9351e60e8ff70f..396936bf63bdf6 100644 --- a/paddle/cinn/frontend/op_mapper_registry.h +++ b/paddle/cinn/frontend/op_mapper_registry.h @@ -59,7 +59,7 @@ class OpMapperContext { public: OpMapperContext( const hlir::framework::Scope& scope, - const common::Target& target, + const cinn::common::Target& target, NetBuilder* builder, std::unordered_map* var_map, std::unordered_map* var_model_to_program_map, @@ -99,7 +99,7 @@ class OpMapperContext { struct FeedInfo { std::vector shape; - common::Type type; + cinn::common::Type type; }; void AddFeedInfo(const std::string& name, const FeedInfo& info); @@ -108,7 +108,7 @@ class OpMapperContext { private: const hlir::framework::Scope& scope_; - const common::Target& target_; + const cinn::common::Target& target_; NetBuilder* builder_{nullptr}; std::unordered_map* var_map_{nullptr}; diff --git a/paddle/cinn/frontend/op_mappers/common_utils.h b/paddle/cinn/frontend/op_mappers/common_utils.h index 387a2c1fe7a8c9..61e9dc2cda93f4 100644 --- a/paddle/cinn/frontend/op_mappers/common_utils.h +++ b/paddle/cinn/frontend/op_mappers/common_utils.h @@ -181,7 +181,7 @@ inline std::string GetPaddleDtype(const paddle::cpp::OpDesc& op_desc, return ""; } - return common::Type2Str(dtype_cinn); + return cinn::common::Type2Str(dtype_cinn); } } // namespace utils diff --git a/paddle/cinn/frontend/op_mappers/paddle/clip.cc b/paddle/cinn/frontend/op_mappers/paddle/clip.cc index 1dc659b7410f45..f060ec4175fc99 100644 --- a/paddle/cinn/frontend/op_mappers/paddle/clip.cc +++ b/paddle/cinn/frontend/op_mappers/paddle/clip.cc @@ -37,7 +37,8 @@ void ClipOpMapper(const paddle::cpp::OpDesc& op_desc, << "The [Min] tensor shape of clip op should be [1], but here [" << cinn::utils::Join(min_val_tensor->shape, ", ") << "]"; if (x->type != min_val_tensor->type) { - min_val_tensor = builder->Cast(min_val_tensor, common::Type2Str(x->type)); + min_val_tensor = + builder->Cast(min_val_tensor, cinn::common::Type2Str(x->type)); } min_val_tensor = builder->BroadcastTo(min_val_tensor, x->shape); x = builder->Max(x, min_val_tensor); @@ -48,8 +49,8 @@ void ClipOpMapper(const paddle::cpp::OpDesc& op_desc, auto min_val_tensor = builder->FillConstant(x->shape, min_value, - common::UniqName(x->id + "_min"), - common::Type2Str(x->type)); + cinn::common::UniqName(x->id + "_min"), + cinn::common::Type2Str(x->type)); x = builder->Max(x, min_val_tensor); } @@ -62,7 +63,8 @@ void ClipOpMapper(const paddle::cpp::OpDesc& op_desc, << "The [Max] tensor shape of clip op should be [1], but here [" << cinn::utils::Join(max_val_tensor->shape, ", ") << "]"; if (x->type != max_val_tensor->type) { - max_val_tensor = builder->Cast(max_val_tensor, common::Type2Str(x->type)); + max_val_tensor = + builder->Cast(max_val_tensor, cinn::common::Type2Str(x->type)); } max_val_tensor = builder->BroadcastTo(max_val_tensor, x->shape); x = builder->Min(x, max_val_tensor); @@ -70,10 +72,11 @@ void ClipOpMapper(const paddle::cpp::OpDesc& op_desc, CHECK(op_desc.HasAttr("max")) << "The clip op should has 
[max] attribute or [Max] tensor input."; auto max_value = op_desc.GetAttr("max"); - auto max_val_tensor = builder->FillConstant(x->shape, - max_value, - common::UniqName("constant"), - common::Type2Str(x->type)); + auto max_val_tensor = + builder->FillConstant(x->shape, + max_value, + cinn::common::UniqName("constant"), + cinn::common::Type2Str(x->type)); x = builder->Min(x, max_val_tensor); } diff --git a/paddle/cinn/frontend/op_mappers/paddle/constant.cc b/paddle/cinn/frontend/op_mappers/paddle/constant.cc index 8f38bb4ee90340..ca11b9f5780831 100644 --- a/paddle/cinn/frontend/op_mappers/paddle/constant.cc +++ b/paddle/cinn/frontend/op_mappers/paddle/constant.cc @@ -91,7 +91,7 @@ void FillConstantOpMapper(const paddle::cpp::OpDesc& op_desc, CHECK(value_tensor->shape == cinn::utils::ShapeType{1}) << "The shape of [ValueTensor] should be [1], but here [" << cinn::utils::Join(value_tensor->shape, ", ") << "]"; - if (common::Type2Str(value_tensor->type) != dtype) { + if (cinn::common::Type2Str(value_tensor->type) != dtype) { value_tensor = ctx.Builder()->Cast(value_tensor, dtype); } out = ctx.Builder()->BroadcastTo(value_tensor, shape); @@ -131,7 +131,7 @@ void FillAnyLikeOpMapper(const paddle::cpp::OpDesc& op_desc, auto dtype = utils::GetPaddleDtype( op_desc, "dtype", paddle::cpp::VarDescAPI::Type::FP32); if (dtype.empty()) { - dtype = common::Type2Str(x->type); + dtype = cinn::common::Type2Str(x->type); } VLOG(4) << "FillAnyLikeOp: fill constant (" << value << ") with shape (" diff --git a/paddle/cinn/frontend/op_mappers/paddle/cumsum.cc b/paddle/cinn/frontend/op_mappers/paddle/cumsum.cc index 080d53302dc172..3482bacded2216 100644 --- a/paddle/cinn/frontend/op_mappers/paddle/cumsum.cc +++ b/paddle/cinn/frontend/op_mappers/paddle/cumsum.cc @@ -80,7 +80,7 @@ void CumsumOpMapper(const paddle::cpp::OpDesc& op_desc, mask = ctx.Builder()->BroadcastTo(mask, broadcast_shape); x = ctx.Builder()->BroadcastTo(x, broadcast_shape); auto false_value = ctx.Builder()->FillConstant( - x->shape, 0, UniqName("false_value"), common::Type2Str(x->type)); + x->shape, 0, UniqName("false_value"), cinn::common::Type2Str(x->type)); // Select elements with mask auto selected_x = ctx.Builder()->Select(mask, x, false_value); // Do reduce sum diff --git a/paddle/cinn/frontend/op_mappers/paddle/elementwise.cc b/paddle/cinn/frontend/op_mappers/paddle/elementwise.cc index 777b6a68d27af4..792ae1e922904d 100644 --- a/paddle/cinn/frontend/op_mappers/paddle/elementwise.cc +++ b/paddle/cinn/frontend/op_mappers/paddle/elementwise.cc @@ -223,7 +223,7 @@ void PowOpMapper(const paddle::cpp::OpDesc& op_desc, y = ctx.Builder()->FillConstant(x->shape, factor, cinn::UniqName(x_name + "_factor"), - common::Type2Str(x->type)); + cinn::common::Type2Str(x->type)); } else { LOG(FATAL) << "Cannot found [FactorTensor] input or [factor] attribute in " "paddle.pow! 
Please check."; diff --git a/paddle/cinn/frontend/op_mappers/paddle/layer_norm.cc b/paddle/cinn/frontend/op_mappers/paddle/layer_norm.cc index c9a138c3dbc4cd..3931145a5a05e4 100644 --- a/paddle/cinn/frontend/op_mappers/paddle/layer_norm.cc +++ b/paddle/cinn/frontend/op_mappers/paddle/layer_norm.cc @@ -94,10 +94,11 @@ void LayerNormOpMapper(const paddle::cpp::OpDesc& op_desc, std::vector shape{left, right}; auto x_reshape = builder->Reshape(x, shape); auto x_reduce = builder->ReduceSum(x_reshape, {1}); - auto ele_num = builder->FillConstant({left}, - static_cast(right), - common::UniqName("layer_norm_ele_num"), - common::Type2Str(x->type)); + auto ele_num = + builder->FillConstant({left}, + static_cast(right), + cinn::common::UniqName("layer_norm_ele_num"), + cinn::common::Type2Str(x->type)); auto x_mean = builder->Divide(x_reduce, ele_num); // use `E[|x|^2] - |E[x]|^2` instead of `E[|x - E[x]|^2])` to compute variance @@ -107,8 +108,8 @@ void LayerNormOpMapper(const paddle::cpp::OpDesc& op_desc, auto x_mean2 = builder->Multiply(x_mean, builder->Identity(x_mean)); auto zero = builder->FillConstant({left}, 0.f, - common::UniqName("layer_norm_zero"), - common::Type2Str(x->type)); + cinn::common::UniqName("layer_norm_zero"), + cinn::common::Type2Str(x->type)); auto x_var = builder->Max(builder->Subtract(x2_mean, x_mean2), zero); // compute x norm @@ -117,8 +118,8 @@ void LayerNormOpMapper(const paddle::cpp::OpDesc& op_desc, auto epsilon_var = builder->FillConstant({left}, epsilon, - common::UniqName("layer_norm_epsilon"), - common::Type2Str(x->type)); + cinn::common::UniqName("layer_norm_epsilon"), + cinn::common::Type2Str(x->type)); auto x_var_eps = builder->Add(x_var, epsilon_var); auto x_var_sqrt = builder->Sqrt(x_var_eps); auto y_out = diff --git a/paddle/cinn/frontend/op_mappers/paddle/norm.cc b/paddle/cinn/frontend/op_mappers/paddle/norm.cc index 8e45ead8bf185f..48731adcae060c 100644 --- a/paddle/cinn/frontend/op_mappers/paddle/norm.cc +++ b/paddle/cinn/frontend/op_mappers/paddle/norm.cc @@ -41,10 +41,11 @@ struct NormHelper { // std_square_sum = sqrt(square_sum + epsilon) Variable StdSquareSum(Variable square_sum, float epsilon) { - auto epsilon_1d = builder->FillConstant(square_sum->shape, - epsilon, - common::UniqName("norm_epsilon"), - common::Type2Str(square_sum->type)); + auto epsilon_1d = + builder->FillConstant(square_sum->shape, + epsilon, + cinn::common::UniqName("norm_epsilon"), + cinn::common::Type2Str(square_sum->type)); auto std_square_sum = builder->Sqrt(builder->Add(square_sum, epsilon_1d)); return std_square_sum; } @@ -99,14 +100,14 @@ void NormOpMapper(const paddle::cpp::OpDesc& op_desc, auto square_sum = helper.SquareSum(x); auto std_square_sum = helper.StdSquareSum(square_sum, epsilon); auto normalized = ctx.Builder()->Divide(x, std_square_sum); - auto y = ctx.Builder()->Cast(normalized, common::Type2Str(in_type)); + auto y = ctx.Builder()->Cast(normalized, cinn::common::Type2Str(in_type)); ctx.AddVar(out_name, y); ctx.AddVarModelToProgram(out_name, y->id); if (!norm_name.empty()) { auto norm_grad = - ctx.Builder()->Cast(std_square_sum, common::Type2Str(in_type)); + ctx.Builder()->Cast(std_square_sum, cinn::common::Type2Str(in_type)); ctx.AddVar(norm_name, norm_grad); ctx.AddVarModelToProgram(norm_name, norm_grad->id); } diff --git a/paddle/cinn/frontend/op_mappers/paddle/reduce.cc b/paddle/cinn/frontend/op_mappers/paddle/reduce.cc index 0d52d7ec6d7cd4..9162a1158edfaf 100644 --- a/paddle/cinn/frontend/op_mappers/paddle/reduce.cc +++ 
b/paddle/cinn/frontend/op_mappers/paddle/reduce.cc @@ -85,7 +85,7 @@ void ReduceOpMapper(const paddle::cpp::OpDesc& op_desc, auto dtype = utils::GetPaddleDtype( op_desc, "out_dtype", static_cast(-1)); - if (!dtype.empty() && common::Type2Str(out.value()->type) != dtype) { + if (!dtype.empty() && cinn::common::Type2Str(out.value()->type) != dtype) { out = ctx.Builder()->Cast(out.value(), dtype); } diff --git a/paddle/cinn/frontend/op_mappers/paddle/scale.cc b/paddle/cinn/frontend/op_mappers/paddle/scale.cc index b8c8b8c5498850..639af845edefed 100644 --- a/paddle/cinn/frontend/op_mappers/paddle/scale.cc +++ b/paddle/cinn/frontend/op_mappers/paddle/scale.cc @@ -49,12 +49,13 @@ void ScaleOpMapper(const paddle::cpp::OpDesc& op_desc, CHECK(scale_tensor->shape == cinn::utils::ShapeType{1}) << "The shape of [ScaleTensor] should be [1], but here [" << cinn::utils::Join(scale_tensor->shape, ", ") << "]"; - scale_tensor = ctx.Builder()->Cast(scale_tensor, common::Type2Str(x->type)); + scale_tensor = + ctx.Builder()->Cast(scale_tensor, cinn::common::Type2Str(x->type)); scale_tensor = ctx.Builder()->BroadcastTo(scale_tensor, x->shape); if (bias != 0.0f) { auto bias_tensor = ctx.Builder()->FillConstant( - x->shape, bias, x->id + "_bias", common::Type2Str(x->type)); + x->shape, bias, x->id + "_bias", cinn::common::Type2Str(x->type)); if (bias_after_scale) { out = ctx.Builder()->Add(bias_tensor, ctx.Builder()->Multiply(x, scale_tensor)); diff --git a/paddle/cinn/frontend/op_mappers/paddle/scatter.cc b/paddle/cinn/frontend/op_mappers/paddle/scatter.cc index ca7f15ab254c7e..8be9b563b4cd1e 100644 --- a/paddle/cinn/frontend/op_mappers/paddle/scatter.cc +++ b/paddle/cinn/frontend/op_mappers/paddle/scatter.cc @@ -40,10 +40,12 @@ void ScatterOpMapper(const paddle::cpp::OpDesc& op_desc, const auto& updates = ctx.GetVar(updates_name); CHECK(input->type == updates->type) << "checks whether the type of the input and the updates are the same."; - CHECK(indices->type == common::Int(32) || indices->type == common::Int(64)) + CHECK(indices->type == cinn::common::Int(32) || + indices->type == cinn::common::Int(64)) << "checks whether the data type of the indices is either int32 or int64"; - if (indices->type == common::Int(64)) { - indices = ctx.Builder()->Cast(indices, common::Type2Str(common::Int(32))); + if (indices->type == cinn::common::Int(64)) { + indices = ctx.Builder()->Cast( + indices, cinn::common::Type2Str(cinn::common::Int(32))); } CHECK_LE(indices->shape.size(), 2) << "Ids should be 0, 1 or 2 in scatter_op"; if (indices->shape.size() == 0) { @@ -61,8 +63,8 @@ void ScatterOpMapper(const paddle::cpp::OpDesc& op_desc, const auto& zeros = ctx.Builder()->FillConstant(updates->shape, 0, - common::UniqName("scatter_zeros"), - common::Type2Str(updates->type)); + cinn::common::UniqName("scatter_zeros"), + cinn::common::Type2Str(updates->type)); out = ctx.Builder()->ScatterAssign(input, zeros, indices); out = ctx.Builder()->ScatterAdd(out, updates, indices); } diff --git a/paddle/cinn/frontend/op_mappers/science/broadcast.cc b/paddle/cinn/frontend/op_mappers/science/broadcast.cc index f5b3f9cd20f90c..a4cdf22391ec22 100644 --- a/paddle/cinn/frontend/op_mappers/science/broadcast.cc +++ b/paddle/cinn/frontend/op_mappers/science/broadcast.cc @@ -33,7 +33,7 @@ void FillConstantOpMapper(const paddle::cpp::OpDesc& op_desc, op_desc, "dtype", static_cast(paddle::cpp::VarDescAPI::Type::FP32)); auto dtype_pd = static_cast(dtype_id); auto dtype_cinn = utils::CppVarType2CommonType(dtype_pd); - auto dtype = 
common::Type2Str(dtype_cinn); + auto dtype = cinn::common::Type2Str(dtype_cinn); VLOG(4) << "fill constant (" << value << ") with shape (" << cinn::utils::Join(shape, ",") << ") and dtype [" << dtype << "]"; diff --git a/paddle/cinn/frontend/op_mappers/science/transform.cc b/paddle/cinn/frontend/op_mappers/science/transform.cc index 45faa1961790dc..a58f93fd275979 100644 --- a/paddle/cinn/frontend/op_mappers/science/transform.cc +++ b/paddle/cinn/frontend/op_mappers/science/transform.cc @@ -413,7 +413,7 @@ void CastOpMapper(const paddle::cpp::OpDesc& op_desc, op_desc, "dtype", static_cast(paddle::cpp::VarDescAPI::Type::FP32)); auto dtype_pd = static_cast(dtype_id); auto dtype_cinn = utils::CppVarType2CommonType(dtype_pd); - auto dtype = common::Type2Str(dtype_cinn); + auto dtype = cinn::common::Type2Str(dtype_cinn); VLOG(4) << out_name << " = cast(" << x_name << ", dtype=" << dtype << ")"; diff --git a/paddle/cinn/frontend/optimize.cc b/paddle/cinn/frontend/optimize.cc index 1f4572a87d47ed..d88952cabefba6 100644 --- a/paddle/cinn/frontend/optimize.cc +++ b/paddle/cinn/frontend/optimize.cc @@ -134,7 +134,7 @@ std::vector DefaultOpFusionPasses() { std::shared_ptr Optimize( frontend::Program* program, const std::unordered_set& fetch_ids, - common::Target target, + cinn::common::Target target, const OptimizeOptions& options) { cinn::hlir::framework::PassPrinter::GetInstance()->Begin(fetch_ids); // Apply program passes @@ -154,7 +154,7 @@ std::shared_ptr Optimize( std::shared_ptr Optimize( frontend::Program* program, const std::unordered_set& fetch_ids, - common::Target target, + cinn::common::Target target, const std::vector& passes) { OptimizeOptions options; diff --git a/paddle/cinn/frontend/optimize.h b/paddle/cinn/frontend/optimize.h index 543c027308d7b1..b382d0c5a6fb4c 100755 --- a/paddle/cinn/frontend/optimize.h +++ b/paddle/cinn/frontend/optimize.h @@ -38,13 +38,13 @@ std::vector DefaultOpFusionPasses(); std::shared_ptr Optimize( frontend::Program* program, const std::unordered_set& fetch_ids, - common::Target target, + cinn::common::Target target, const OptimizeOptions& options = DefaultTrainingOptimizeOptions()); std::shared_ptr Optimize( frontend::Program* program, const std::unordered_set& fetch_ids, - common::Target target, + cinn::common::Target target, const std::vector& passes); } // namespace frontend diff --git a/paddle/cinn/frontend/paddle/model_parser.cc b/paddle/cinn/frontend/paddle/model_parser.cc index 5c2bf8eb37363b..dce003b225fe74 100644 --- a/paddle/cinn/frontend/paddle/model_parser.cc +++ b/paddle/cinn/frontend/paddle/model_parser.cc @@ -49,7 +49,7 @@ int SizeOfType(framework_proto::VarType::Type type) { void TensorFromStream(std::istream &is, hlir::framework::_Tensor_ *tensor, - const common::Target &target) { + const cinn::common::Target &target) { using Type = framework_proto::VarType::Type; uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); @@ -117,7 +117,7 @@ void TensorFromStream(std::istream &is, void LoadLoDTensor(std::istream &is, hlir::framework::Variable *var, - const common::Target &target) { + const cinn::common::Target &target) { auto &tensor = absl::get(*var); uint32_t version{}; is.read(reinterpret_cast(&version), sizeof(version)); @@ -170,7 +170,7 @@ void LoadParams(const std::string &path) {} // Load directly to CPU, and latter transfer to other devices. 
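Note: the `Optimize` overloads above are the entry point that the test hunks earlier in this patch drive. A condensed sketch of that call pattern, using the fully qualified names this patch introduces (the glue itself is hypothetical; `program` stands for a previously built `frontend::Program`):

    // Hypothetical glue condensing the test-side pattern repeated above; the
    // CINN symbols (Target, DefaultNVGPUTarget, DefaultHostTarget, Optimize)
    // are the ones shown in these hunks.
    #ifdef CINN_WITH_CUDA
      cinn::common::Target target = cinn::common::DefaultNVGPUTarget();
    #else
      cinn::common::Target target = cinn::common::DefaultHostTarget();
    #endif
      std::unordered_set<std::string> fetch_ids;  // empty: fetch nothing extra
      auto graph = cinn::frontend::Optimize(&program, fetch_ids, target);
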
void LoadParam(const std::string &path, hlir::framework::Variable *out, - const common::Target &target) { + const cinn::common::Target &target) { std::ifstream fin(path, std::ios::binary); CHECK(fin.is_open()) << "failed to open file " << path; LoadLoDTensor(fin, out, target); @@ -190,7 +190,7 @@ void LoadCombinedParamsPb(const std::string &path, hlir::framework::Scope *scope, const cpp::ProgramDesc &cpp_prog, bool params_from_memory, - const common::Target &target) { + const cinn::common::Target &target) { CHECK(scope); auto prog = cpp_prog; auto &main_block_desc = *prog.GetBlock(0); @@ -236,7 +236,7 @@ void LoadModelPb(const std::string &model_dir, cpp::ProgramDesc *cpp_prog, bool combined, bool model_from_memory, - const common::Target &target) { + const cinn::common::Target &target) { CHECK(cpp_prog); CHECK(scope); cpp_prog->ClearBlocks(); diff --git a/paddle/cinn/frontend/paddle/model_parser.h b/paddle/cinn/frontend/paddle/model_parser.h index 8bc10108b79de7..03834a7f525c2e 100644 --- a/paddle/cinn/frontend/paddle/model_parser.h +++ b/paddle/cinn/frontend/paddle/model_parser.h @@ -30,14 +30,15 @@ namespace cinn::frontend::paddle { namespace framework_proto = ::cinn::frontend::paddle::proto; // Read a model and files of parameters in pb format. -void LoadModelPb(const std::string& model_dir, - const std::string& model_file, - const std::string& param_file, - hlir::framework::Scope* scope, - cpp::ProgramDesc* cpp_prog, - bool combined = true, - bool model_from_memory = false, - const common::Target& target = common::DefaultHostTarget()); +void LoadModelPb( + const std::string& model_dir, + const std::string& model_file, + const std::string& param_file, + hlir::framework::Scope* scope, + cpp::ProgramDesc* cpp_prog, + bool combined = true, + bool model_from_memory = false, + const cinn::common::Target& target = cinn::common::DefaultHostTarget()); // Read a __model__ file. std::unique_ptr LoadProgram( @@ -45,7 +46,7 @@ std::unique_ptr LoadProgram( void LoadLoDTensor(std::istream& is, hlir::framework::Variable* var, - const common::Target& target); + const cinn::common::Target& target); // Read a single file containing all the parameters. void LoadParams(const std::string& path); @@ -53,21 +54,21 @@ void LoadParams(const std::string& path); // Load a single parameter to an output tensor. 
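Note: the op-mapper hunks earlier in this patch repeat one construction many times: build a constant tensor that matches an existing variable's shape and dtype. A hypothetical wrapper isolating that pattern (`FillConstant`, `UniqName`, and `Type2Str` are the real calls from those hunks; `MakeScalarLike` itself is illustrative only):

    // Illustrative only: mirrors the constant-building pattern in the
    // clip.cc / layer_norm.cc / norm.cc hunks above.
    Variable MakeScalarLike(NetBuilder* builder, const Variable& x, float value) {
      return builder->FillConstant(x->shape,
                                   value,
                                   cinn::common::UniqName(x->id + "_const"),
                                   cinn::common::Type2Str(x->type));
    }
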
void LoadParam(const std::string& path, hlir::framework::Variable* out, - const common::Target& target); + const cinn::common::Target& target); void LoadCombinedParamsPb( const std::string& path, hlir::framework::Scope* scope, const pb::ProgramDesc& prog, bool params_from_memory = false, - const common::Target& target = common::DefaultHostTarget()); + const cinn::common::Target& target = cinn::common::DefaultHostTarget()); // LoDTensor to ostream void TensorToStream(std::ostream& os, const hlir::framework::_Tensor_& tensor); void TensorFromStream( std::istream& is, hlir::framework::_Tensor_* tensor, - const common::Target& target = common::DefaultHostTarget()); + const cinn::common::Target& target = cinn::common::DefaultHostTarget()); void ReadBinaryFile(const std::string& filename, std::string* contents); } // namespace cinn::frontend::paddle diff --git a/paddle/cinn/frontend/paddle_model_convertor.cc b/paddle/cinn/frontend/paddle_model_convertor.cc index 1e4aa1eb46b221..a70818f38af219 100644 --- a/paddle/cinn/frontend/paddle_model_convertor.cc +++ b/paddle/cinn/frontend/paddle_model_convertor.cc @@ -35,10 +35,10 @@ namespace frontend { using cinn::utils::Attribute; PaddleModelConvertor::PaddleModelConvertor() - : PaddleModelConvertor(common::DefaultTarget(), nullptr, nullptr) {} + : PaddleModelConvertor(cinn::common::DefaultTarget(), nullptr, nullptr) {} PaddleModelConvertor::PaddleModelConvertor( - const common::Target& target, + const cinn::common::Target& target, std::shared_ptr builder, std::shared_ptr scope) : target_(target), builder_(builder), scope_(scope) { @@ -241,7 +241,7 @@ Program PaddleModelConvertor::operator()() { return builder_->Build(); } void PaddleModelConvertor::CreateInput(const std::string& dtype, const cinn::utils::ShapeType& shape, const std::string& name) { - OpMapperContext::FeedInfo feed_info = {shape, common::Str2Type(dtype)}; + OpMapperContext::FeedInfo feed_info = {shape, cinn::common::Str2Type(dtype)}; ctx_->AddFeedInfo(name, feed_info); RunOp("feed", {}, {{"Out", {name}}}, {}); diff --git a/paddle/cinn/frontend/paddle_model_convertor.h b/paddle/cinn/frontend/paddle_model_convertor.h index ee83223d8c965f..a6243ac369bae8 100644 --- a/paddle/cinn/frontend/paddle_model_convertor.h +++ b/paddle/cinn/frontend/paddle_model_convertor.h @@ -41,7 +41,7 @@ class PaddleModelConvertor { public: PaddleModelConvertor(); - PaddleModelConvertor(const common::Target& target, + PaddleModelConvertor(const cinn::common::Target& target, std::shared_ptr builder = nullptr, std::shared_ptr scope = nullptr); @@ -103,7 +103,7 @@ class PaddleModelConvertor { std::unique_ptr ctx_; std::shared_ptr builder_; - const common::Target& target_; + const cinn::common::Target& target_; std::shared_ptr scope_; }; diff --git a/paddle/cinn/frontend/paddle_model_convertor_test.cc b/paddle/cinn/frontend/paddle_model_convertor_test.cc index 953a3919601320..30364c05e417e7 100644 --- a/paddle/cinn/frontend/paddle_model_convertor_test.cc +++ b/paddle/cinn/frontend/paddle_model_convertor_test.cc @@ -92,7 +92,7 @@ void RunProgram(const Target& target, Program* prog) { } TEST(PaddleModelConvertor, basic) { - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); PaddleModelConvertor model_transform(target); model_transform.LoadModel(FLAGS_model_dir); diff --git a/paddle/cinn/frontend/paddle_model_to_program.cc b/paddle/cinn/frontend/paddle_model_to_program.cc index e17ca8863b5f57..52c91216dd9011 100644 --- a/paddle/cinn/frontend/paddle_model_to_program.cc +++ 
b/paddle/cinn/frontend/paddle_model_to_program.cc @@ -98,7 +98,8 @@ void PaddleModelToProgram::AddOpMapper_scale() { CHECK(scale_tensor_var) << "No scale tensor found in the scope"; auto& scale_tensor = absl::get(*scale_tensor_var); - scale = scale_tensor->mutable_data(common::DefaultHostTarget())[0]; + scale = scale_tensor->mutable_data( + cinn::common::DefaultHostTarget())[0]; } if (op_desc.HasAttr("bias")) { // the old model format bias = op_desc.GetAttr("bias"); diff --git a/paddle/cinn/frontend/paddle_model_to_program.h b/paddle/cinn/frontend/paddle_model_to_program.h index ab520e608de377..2fe376a6086191 100644 --- a/paddle/cinn/frontend/paddle_model_to_program.h +++ b/paddle/cinn/frontend/paddle_model_to_program.h @@ -43,7 +43,7 @@ class PaddleModelToProgram { explicit PaddleModelToProgram( hlir::framework::Scope* scope, std::unordered_map> input_shape_map, - const common::Target& target) + const cinn::common::Target& target) : scope_(scope), input_shape_map_(input_shape_map), target_(target), @@ -145,7 +145,7 @@ class PaddleModelToProgram { // map from var in Paddle model to var name in program. absl::flat_hash_map var_model_to_program_map_; hlir::framework::Scope* scope_{}; - common::Target target_; + cinn::common::Target target_; }; } // namespace frontend diff --git a/paddle/cinn/frontend/pass/auto_broadcast.cc b/paddle/cinn/frontend/pass/auto_broadcast.cc index 558105a44ad25e..785ceb9cad4a15 100644 --- a/paddle/cinn/frontend/pass/auto_broadcast.cc +++ b/paddle/cinn/frontend/pass/auto_broadcast.cc @@ -118,7 +118,7 @@ class AutoBroadcastPass : public ProgramPass { protected: void ApplyImpl(Program* program, const std::unordered_set& fetch_ids, - const common::Target& target) override { + const cinn::common::Target& target) override { NetBuilder builder("auto_broadcast_builder"); for (auto& var : program->GetInputs()) { builder.CreateInput(var); diff --git a/paddle/cinn/frontend/pass/auto_cast.cc b/paddle/cinn/frontend/pass/auto_cast.cc index 838ff8b06f1ddf..ebe129af680e46 100644 --- a/paddle/cinn/frontend/pass/auto_cast.cc +++ b/paddle/cinn/frontend/pass/auto_cast.cc @@ -39,8 +39,9 @@ Instruction CreateNewCastInstruction(const Variable& input, const Variable& output) { Instruction new_cast_instr("cast", {input}); new_cast_instr->outputs = {output}; - new_cast_instr->attrs = {{"dtype", common::Type2Str(output->type)}}; - new_cast_instr->attrs_ordered = {{"dtype", common::Type2Str(output->type)}}; + new_cast_instr->attrs = {{"dtype", cinn::common::Type2Str(output->type)}}; + new_cast_instr->attrs_ordered = { + {"dtype", cinn::common::Type2Str(output->type)}}; return new_cast_instr; } @@ -256,7 +257,7 @@ class AutoCastPass : public ProgramPass { protected: void ApplyImpl(Program* program, const std::unordered_set& fetch_ids, - const common::Target& target) override { + const cinn::common::Target& target) override { NetBuilder builder("auto_cast_builder"); for (auto& var : program->GetInputs()) { builder.CreateInput(var); diff --git a/paddle/cinn/frontend/pass/auto_cast_test.cc b/paddle/cinn/frontend/pass/auto_cast_test.cc index 4b570a2755cdbc..80532a39a3bba0 100644 --- a/paddle/cinn/frontend/pass/auto_cast_test.cc +++ b/paddle/cinn/frontend/pass/auto_cast_test.cc @@ -33,11 +33,11 @@ namespace cinn::frontend { TEST(AutoCast, Exp) { NetBuilder builder("net_builder"); - auto x = builder.CreateInput(common::Float16(), {4, 5, 3}, "X"); + auto x = builder.CreateInput(cinn::common::Float16(), {4, 5, 3}, "X"); auto out = builder.Exp(x); auto program = builder.Build(); - common::Target target 
= common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::pair, std::vector> passes{ {}, {"AutoCast", "Decomposer"}}; CompareProgramPassResult(&program, target, {out->id}, -2, passes); @@ -45,11 +45,11 @@ TEST(AutoCast, Exp) { TEST(AutoCast, Exp_bf16) { NetBuilder builder("net_builder"); - auto x = builder.CreateInput(common::BFloat16(), {4, 5, 3}, "X"); + auto x = builder.CreateInput(cinn::common::BFloat16(), {4, 5, 3}, "X"); auto out = builder.Exp(x); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::pair, std::vector> passes{ {}, {"AutoCast", "Decomposer"}}; CompareProgramPassResult(&program, target, {out->id}, -2, passes); @@ -57,7 +57,8 @@ TEST(AutoCast, Exp_bf16) { TEST(AutoCast, BatchNorm) { NetBuilder builder("net_builder"); - auto x = builder.CreateInput(common::Float16(), {128, 64, 112, 112}, "X"); + auto x = + builder.CreateInput(cinn::common::Float16(), {128, 64, 112, 112}, "X"); auto scale = builder.FillConstant({64}, 1.0f, "scale", "float32"); auto bias = builder.FillConstant({64}, 0.0f, "bias", "float32"); auto mean = builder.FillConstant({64}, 0.0f, "mean", "float32"); @@ -66,7 +67,7 @@ TEST(AutoCast, BatchNorm) { x, scale, bias, mean, variance, 1e-5f, 0.9f, "NCHW", false); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::pair, std::vector> passes{ {}, {"AutoCast", "Decomposer"}}; CompareProgramPassResult(&program, target, {out[0]->id}, -2, passes); @@ -74,7 +75,8 @@ TEST(AutoCast, BatchNorm) { TEST(AutoCast, BatchNorm_bf16) { NetBuilder builder("net_builder"); - auto x = builder.CreateInput(common::BFloat16(), {128, 64, 112, 112}, "X"); + auto x = + builder.CreateInput(cinn::common::BFloat16(), {128, 64, 112, 112}, "X"); auto scale = builder.FillConstant({64}, 1.0f, "scale", "float32"); auto bias = builder.FillConstant({64}, 0.0f, "bias", "float32"); auto mean = builder.FillConstant({64}, 0.0f, "mean", "float32"); @@ -83,7 +85,7 @@ TEST(AutoCast, BatchNorm_bf16) { x, scale, bias, mean, variance, 1e-5f, 0.9f, "NCHW", false); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::pair, std::vector> passes{ {}, {"AutoCast", "Decomposer"}}; CompareProgramPassResult(&program, target, {out[0]->id}, -2, passes); diff --git a/paddle/cinn/frontend/pass/cast_collapsing.cc b/paddle/cinn/frontend/pass/cast_collapsing.cc index 5fc40d407029f5..7667336f544702 100644 --- a/paddle/cinn/frontend/pass/cast_collapsing.cc +++ b/paddle/cinn/frontend/pass/cast_collapsing.cc @@ -67,7 +67,7 @@ class CastCollapsingPass : public ProgramPass { void ApplyImpl(Program* program, const std::unordered_set& fetch_ids, - const common::Target& target) const override { + const cinn::common::Target& target) const override { // `out2instr` is used to represent the mapping of Output to Instruction. OutputToOpMap out2instr; // `in2instr` is used to represent the mapping of Input to Instruction. 
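Note: the reason every hunk in this patch spells out `cinn::common::` in full: once DDim and related types move into a new top-level `::common` namespace, the shorthand `common::` written inside `namespace cinn` still resolves to `cinn::common`, so it cannot reach the relocated types (and becomes ambiguous wherever a using-directive makes both namespaces visible). A minimal, self-contained reproduction with stand-in types:

    // Stand-in types only, not the real CINN/Paddle declarations.
    namespace common { struct DDim {}; }                    // new top-level namespace
    namespace cinn { namespace common { struct Target {}; } }

    namespace cinn {
    inline void Demo() {
      common::Target t;         // OK: `common` here names cinn::common
      // common::DDim d;        // error: cinn::common shadows ::common, so the
      //                        // relocated DDim is unreachable via shorthand
      ::common::DDim d;         // explicit global qualification works
      cinn::common::Target t2;  // the convention this patch adopts
      (void)t; (void)d; (void)t2;
    }
    }  // namespace cinn
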
diff --git a/paddle/cinn/frontend/pass/cast_collapsing_test.cc b/paddle/cinn/frontend/pass/cast_collapsing_test.cc index 8384002c872cfb..42415da8ecdaa1 100644 --- a/paddle/cinn/frontend/pass/cast_collapsing_test.cc +++ b/paddle/cinn/frontend/pass/cast_collapsing_test.cc @@ -42,7 +42,7 @@ TEST(CastCollapsing, FuseTwoCast) { auto out = builder.Cast(x_t, "float32"); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id()}, std::back_inserter(input_ids), @@ -63,7 +63,7 @@ TEST(CastCollapsing, FuseThreeCast) { auto out = builder.Cast(x_2t, "float32"); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id()}, std::back_inserter(input_ids), @@ -82,7 +82,7 @@ TEST(CastCollapsing, ReplaceUselessCastWithIndentity) { auto out = builder.Cast(x, "float32"); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id()}, std::back_inserter(input_ids), @@ -104,7 +104,7 @@ TEST(CastCollapsing, FuseCastToUseless) { auto out = builder.Add(x_3t, x_3t); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id()}, std::back_inserter(input_ids), @@ -128,7 +128,7 @@ TEST(TransposeCollapsing, FuseTransposeWithMultiOutput) { auto out3 = builder.Transpose(x_3t, {0, 2, 1}); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id()}, std::back_inserter(input_ids), @@ -159,7 +159,7 @@ TEST(TransposeCollapsing, FuseTwoSecTranspose) { auto out2 = builder.Transpose(x_2t, {0, 2, 1}); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id()}, std::back_inserter(input_ids), @@ -181,7 +181,7 @@ TEST(TransposeCollapsing, FuseTwoHorizontalTranspose) { auto out = builder.Add(y_t1, y_t2); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id()}, std::back_inserter(input_ids), @@ -203,7 +203,7 @@ TEST(TransposeCollapsing, FuseVerAndHorTranspose) { auto out = builder.Add(y_t2, y_t3); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id()}, std::back_inserter(input_ids), diff --git a/paddle/cinn/frontend/pass/dead_code_eliminate.cc b/paddle/cinn/frontend/pass/dead_code_eliminate.cc index 0c093cf75fd024..2776135281dc0a 100644 --- a/paddle/cinn/frontend/pass/dead_code_eliminate.cc +++ b/paddle/cinn/frontend/pass/dead_code_eliminate.cc @@ -35,7 +35,7 @@ class DeadCodeEliminatePass : public ProgramPass { void ApplyImpl(Program* program, const std::unordered_set& fetch_ids, - const 
common::Target& target) override { + const cinn::common::Target& target) override { if (!CheckFetchIds(*program, fetch_ids)) { return; } diff --git a/paddle/cinn/frontend/pass/dead_code_eliminate_test.cc b/paddle/cinn/frontend/pass/dead_code_eliminate_test.cc index 7e418f394dae35..d2eb4faa70723d 100644 --- a/paddle/cinn/frontend/pass/dead_code_eliminate_test.cc +++ b/paddle/cinn/frontend/pass/dead_code_eliminate_test.cc @@ -46,7 +46,7 @@ TEST(DeadCodeEliminate, remove_single) { std::vector input_names = {x.id().data()}; std::vector output_names = {identity_1->id, reduce_sum_2->id}; - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::pair, std::vector> passes{ {"Decomposer"}, {"DeadCodeEliminate"}}; CompareResult( @@ -75,7 +75,7 @@ TEST(DeadCodeEliminate, remove_multiple) { std::vector input_names = {x.id().data()}; std::vector output_names = {reduce_sum_1->id}; - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::pair, std::vector> passes{ {"Decomposer"}, {"DeadCodeEliminate"}}; CompareResult( diff --git a/paddle/cinn/frontend/pass/decomposer.cc b/paddle/cinn/frontend/pass/decomposer.cc index b18ac57be73f31..06c685b418817c 100755 --- a/paddle/cinn/frontend/pass/decomposer.cc +++ b/paddle/cinn/frontend/pass/decomposer.cc @@ -30,7 +30,7 @@ class DecomposerPass : public ProgramPass { void ApplyImpl(Program* prog, const std::unordered_set& fetch_ids, - const common::Target& target) const override { + const cinn::common::Target& target) const override { // step 1: set the inputs of the origin program to the new program NetBuilder builder("decomposer_builder"); for (auto& var : prog->GetInputs()) { diff --git a/paddle/cinn/frontend/pass/decomposer_test.cc b/paddle/cinn/frontend/pass/decomposer_test.cc index 811d38ea693ddb..f38f86de4f07cb 100644 --- a/paddle/cinn/frontend/pass/decomposer_test.cc +++ b/paddle/cinn/frontend/pass/decomposer_test.cc @@ -60,9 +60,9 @@ TEST(DecomposePass, basic) { } #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif ProgramPass::Apply(&prog, {}, target, {"Decomposer"}); diff --git a/paddle/cinn/frontend/pass/expand_zero_dim_pass.cc b/paddle/cinn/frontend/pass/expand_zero_dim_pass.cc index 9732478c75b237..cd058e0a2ecdeb 100644 --- a/paddle/cinn/frontend/pass/expand_zero_dim_pass.cc +++ b/paddle/cinn/frontend/pass/expand_zero_dim_pass.cc @@ -32,7 +32,7 @@ class ExpandZeroDimPass : public ProgramPass { protected: void ApplyImpl(Program* program, const std::unordered_set& fetch_ids, - const common::Target& target) override { + const cinn::common::Target& target) override { NetBuilder builder("expand_zero_dim_builder"); for (int i = 0; i < program->size(); ++i) { auto& instr = (*program)[i]; diff --git a/paddle/cinn/frontend/pass/expand_zero_dim_pass_test.cc b/paddle/cinn/frontend/pass/expand_zero_dim_pass_test.cc index 9f65ec5d0ea33a..6420dede273c6d 100644 --- a/paddle/cinn/frontend/pass/expand_zero_dim_pass_test.cc +++ b/paddle/cinn/frontend/pass/expand_zero_dim_pass_test.cc @@ -85,7 +85,7 @@ TEST(ExpandZeroDimPass, expand_zero_dim_1) { auto y = builder.CreateInput(Float(32), {}, "y"); auto out = builder.Add(x, y); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); size_t 
origin_size = program.size(); VLOG(1) << "Program Before ExpandZeroDimPass:\n" << program; @@ -125,7 +125,7 @@ TEST(ExpandZeroDimPass, expand_zero_dim_2) { auto y = builder.CreateInput(Float(32), {}, "y"); auto out = builder.Add(x, y); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); size_t origin_size = program.size(); VLOG(1) << "Program Before ExpandZeroDimPass:\n" << program; diff --git a/paddle/cinn/frontend/pass/fill_constant_folding.cc b/paddle/cinn/frontend/pass/fill_constant_folding.cc index c6ee33bc6c79cc..b731c9ab3cf6a2 100644 --- a/paddle/cinn/frontend/pass/fill_constant_folding.cc +++ b/paddle/cinn/frontend/pass/fill_constant_folding.cc @@ -94,7 +94,7 @@ class FillConstantFoldingPass : public ProgramPass { void ApplyImpl(Program* program, const std::unordered_set& fetch_ids, - const common::Target& target) const override { + const cinn::common::Target& target) const override { auto in2instr = GetInputToOpMap(program); // `fill_constant_map` is used to represent the first fill_constant and its diff --git a/paddle/cinn/frontend/pass/fill_constant_folding_test.cc b/paddle/cinn/frontend/pass/fill_constant_folding_test.cc index 2300c16b3e3d00..e8385d972f733d 100644 --- a/paddle/cinn/frontend/pass/fill_constant_folding_test.cc +++ b/paddle/cinn/frontend/pass/fill_constant_folding_test.cc @@ -56,7 +56,7 @@ TEST(TransposeFolding, FoldTwoFillConstant) { auto transpose_y = builder.Transpose(y, {1, 0}); auto out = builder.Add(transpose_x, transpose_y); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); size_t origin_size = program.size(); VLOG(1) << "Program Before FillConstantFolding:\n" << program; @@ -97,7 +97,7 @@ TEST(TransposeFolding, FoldTwoFillConstantWithSameOuput) { auto transpose_x = builder.Transpose(x, {1, 0}); auto out = builder.Add(y, y); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); size_t origin_size = program.size(); VLOG(1) << "Program Before FillConstantFolding:\n" << program; @@ -136,7 +136,7 @@ TEST(TransposeFolding, FoldThreeFillConstant) { auto transpose_x = builder.Transpose(x, {1, 0}); auto out = builder.Add(y, z); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); size_t origin_size = program.size(); VLOG(1) << "Program Before FillConstantFolding:\n" << program; // Program { @@ -175,7 +175,7 @@ TEST(TransposeFolding, FoldThreeFillConstantWithOneDiff) { auto transpose_x = builder.Transpose(x, {1, 0}); auto out = builder.Add(y, z); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); auto scope = hlir::framework::BuildScope(target, graph); diff --git a/paddle/cinn/frontend/pass/fill_constant_rewriter.cc b/paddle/cinn/frontend/pass/fill_constant_rewriter.cc index 569d1ba77f859f..2ede43f4ae9b6d 100644 --- a/paddle/cinn/frontend/pass/fill_constant_rewriter.cc +++ b/paddle/cinn/frontend/pass/fill_constant_rewriter.cc @@ -154,7 +154,7 @@ class FillConstantRewriterPass : public ProgramPass { void ApplyImpl(Program* program, const std::unordered_set& fetch_ids, - const common::Target& target) override { + const cinn::common::Target& target) override { auto input2instr = GetInput2Instr(program); std::unordered_set remove_instr; diff --git 
a/paddle/cinn/frontend/pass/gemm_rewriter.cc b/paddle/cinn/frontend/pass/gemm_rewriter.cc index 9a43ea4ade125d..fe178c0b88137b 100644 --- a/paddle/cinn/frontend/pass/gemm_rewriter.cc +++ b/paddle/cinn/frontend/pass/gemm_rewriter.cc @@ -39,7 +39,7 @@ class GemmRewriterPass : public ProgramPass { void ApplyImpl(Program* prog, const std::unordered_set& fetch_ids, - const common::Target& target) override { + const cinn::common::Target& target) override { if (target.arch != Target::Arch::NVGPU || !prog->size()) { return; } diff --git a/paddle/cinn/frontend/pass/gemm_rewriter_test.cc b/paddle/cinn/frontend/pass/gemm_rewriter_test.cc index 88a4f7482f48ec..22f81a6b9a22fe 100755 --- a/paddle/cinn/frontend/pass/gemm_rewriter_test.cc +++ b/paddle/cinn/frontend/pass/gemm_rewriter_test.cc @@ -43,7 +43,7 @@ TEST(GemmRwriter, BatchedTransLeft) { auto out = builder.Add(d, e); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), e.id()}, std::back_inserter(input_ids), @@ -67,7 +67,7 @@ TEST(GemmRwriter, BatchedTransRight) { auto out = builder.Add(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), b.id(), f.id()}, std::back_inserter(input_ids), @@ -92,7 +92,7 @@ TEST(GemmRwriter, BatchedTransTwo) { auto out = builder.Add(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -115,7 +115,7 @@ TEST(GemmRwriter, BatchedNoTrans) { auto out = builder.Add(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), b.id(), f.id()}, std::back_inserter(input_ids), @@ -139,7 +139,7 @@ TEST(GemmRwriter, TransLeft) { auto out = builder.Add(d, e); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), e.id()}, std::back_inserter(input_ids), @@ -163,7 +163,7 @@ TEST(GemmRwriter, TransRight) { auto out = builder.Add(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), b.id(), f.id()}, std::back_inserter(input_ids), @@ -188,7 +188,7 @@ TEST(GemmRwriter, TransTwo) { auto out = builder.Add(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -211,7 +211,7 @@ TEST(GemmRwriter, NoTrans) { auto out = builder.Add(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), b.id(), f.id()}, 
std::back_inserter(input_ids), @@ -245,7 +245,7 @@ TEST(GemmRwriter, BatchedComplex) { auto out = builder.Add(p, q); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{d.id(), z.id()}, std::back_inserter(input_ids), @@ -276,7 +276,7 @@ TEST(GemmRwriter, Complex) { auto out = builder.Add(p, q); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{c.id(), z.id()}, std::back_inserter(input_ids), diff --git a/paddle/cinn/frontend/pass/pass_test_helper.h b/paddle/cinn/frontend/pass/pass_test_helper.h index 0aeebef4022fec..89e98cf5cc5a7e 100644 --- a/paddle/cinn/frontend/pass/pass_test_helper.h +++ b/paddle/cinn/frontend/pass/pass_test_helper.h @@ -73,7 +73,7 @@ inline void PrintMatrix(const std::vector& mat, int bs, int m, int n) { } inline void RunGraph(std::shared_ptr graph, - const common::Target& target, + const cinn::common::Target& target, const std::shared_ptr& scope, const std::vector& output_ids, const std::vector& graph_passes) { @@ -91,7 +91,7 @@ inline void RunGraph(std::shared_ptr graph, inline std::vector RunProgram( const Program& program, - const common::Target& target, + const cinn::common::Target& target, const std::vector& input_ids, const std::vector& output_ids, const std::vector& graph_passes, @@ -177,7 +177,7 @@ struct OptimizeConfig { }; inline void CompareResult(Program* program, - const common::Target& target, + const cinn::common::Target& target, const std::vector& input_ids, const std::vector& output_ids, size_t size_diff, @@ -222,7 +222,7 @@ inline void CompareResult(Program* program, inline bool CompareProgramPassResult( Program* program, - const common::Target& target, + const cinn::common::Target& target, const std::unordered_set& fetch_ids, const size_t size_diff, const OptimizeConfig& passes) { diff --git a/paddle/cinn/frontend/pass/program_topoerror_test.cc b/paddle/cinn/frontend/pass/program_topoerror_test.cc index bf1355e16387f7..95f84729241691 100644 --- a/paddle/cinn/frontend/pass/program_topoerror_test.cc +++ b/paddle/cinn/frontend/pass/program_topoerror_test.cc @@ -60,7 +60,7 @@ TEST(TransposeFoldingInput, TransposeWithMultiMamtul) { auto out = builder.Add(dot1, dot2); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), diff --git a/paddle/cinn/frontend/pass/remove_identity.cc b/paddle/cinn/frontend/pass/remove_identity.cc index bd80a45701dedd..81719b75677bfb 100644 --- a/paddle/cinn/frontend/pass/remove_identity.cc +++ b/paddle/cinn/frontend/pass/remove_identity.cc @@ -144,7 +144,7 @@ class RemoveIdentityPass : public ProgramPass { protected: void ApplyImpl(Program* program, const std::unordered_set& fetch_ids, - const common::Target& target) override { + const cinn::common::Target& target) override { CollectInfo(*program, fetch_ids); VLOG(3) << "Total remove " << remove_idxs_.size() << " instructions."; diff --git a/paddle/cinn/frontend/pass/test_helper.h b/paddle/cinn/frontend/pass/test_helper.h index fb636a6d27c676..ea3ed61f1165b3 100644 --- a/paddle/cinn/frontend/pass/test_helper.h +++ b/paddle/cinn/frontend/pass/test_helper.h @@ -74,7 
+74,7 @@ std::vector CopyToVector(const hlir::framework::Tensor tensor) { class PassTest { public: - PassTest() { target_ = common::DefaultTarget(); } + PassTest() { target_ = cinn::common::DefaultTarget(); } int RunAndCheck(NetBuilder* builder, const std::vector& program_passes, diff --git a/paddle/cinn/frontend/pass/transpose_collapsing.cc b/paddle/cinn/frontend/pass/transpose_collapsing.cc index ecf71ae55a0aac..8f6530853acefe 100644 --- a/paddle/cinn/frontend/pass/transpose_collapsing.cc +++ b/paddle/cinn/frontend/pass/transpose_collapsing.cc @@ -78,7 +78,7 @@ class TransposeCollapsingPass : public ProgramPass { void ApplyImpl(Program* program, const std::unordered_set& fetch_ids, - const common::Target& target) const override { + const cinn::common::Target& target) const override { // `out2instr` is used to represent the mapping of Output to Instruction. OutputToOpMap out2instr; // `in2instr` is used to represent the mapping of Input to Instruction. diff --git a/paddle/cinn/frontend/pass/transpose_collapsing_test.cc b/paddle/cinn/frontend/pass/transpose_collapsing_test.cc index c82a4d3b3b24ce..b1cdf7f09e7c96 100644 --- a/paddle/cinn/frontend/pass/transpose_collapsing_test.cc +++ b/paddle/cinn/frontend/pass/transpose_collapsing_test.cc @@ -38,7 +38,7 @@ void SetInputData(const hlir::framework::Tensor& tensor, Target target) { host_memory[i] = static_cast(i); } #ifdef CINN_WITH_CUDA - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { cudaMemcpy(data, host_memory.data(), tensor->shape().numel() * sizeof(float), @@ -46,7 +46,7 @@ void SetInputData(const hlir::framework::Tensor& tensor, Target target) { return; } #endif - CHECK(target == common::DefaultHostTarget()); + CHECK(target == cinn::common::DefaultHostTarget()); std::copy(host_memory.begin(), host_memory.end(), data); } std::vector> RunWithProgram( @@ -88,7 +88,7 @@ TEST(TransposeCollapsing, FuseTwoTranspose) { auto x_t = builder.Transpose(x, {0, 2, 1}); auto out = builder.Transpose(x_t, {2, 1, 0}); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::initializer_list fetch_list = {out->id}; @@ -127,7 +127,7 @@ TEST(TransposeCollapsing, FuseThreeTranspose) { auto x_2t = builder.Transpose(x_1t, {2, 1, 0}); auto out = builder.Transpose(x_2t, {1, 2, 0}); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::initializer_list fetch_list = {out->id}; @@ -166,7 +166,7 @@ TEST(TransposeCollapsing, RemoveUselessTranspose) { auto x_t = builder.Transpose(x, {0, 1, 2}); auto out = builder.Add(x, x_t); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::initializer_list fetch_list = {out->id}; @@ -201,7 +201,7 @@ TEST(TransposeCollapsing, ReplaceUselessTransposeWithIndentity) { auto x = builder.CreateInput(Float(32), {4, 5, 3}, "X"); auto out = builder.Transpose(x, {0, 1, 2}); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::initializer_list fetch_list = {out->id}; @@ -241,7 +241,7 @@ TEST(TransposeCollapsing, FuseTransposeToUseless) { auto x_3t = builder.Transpose(x_2t, {0, 2, 1}); auto out = builder.Add(x_3t, x_3t); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::initializer_list fetch_list = {out->id}; @@ -286,7 +286,7 @@ 
TEST(TransposeCollapsing, FuseTransposeWithMultiOutput) { auto out2 = builder.Sqrt(x_2t); auto out3 = builder.Sqrt(x_3t); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::initializer_list fetch_list = { out1->id, out2->id, out3->id}; @@ -338,7 +338,7 @@ TEST(TransposeCollapsing, FuseTwoSecTranspose) { auto x_4t = builder.Transpose(x_3t, {2, 1, 0}); auto out2 = builder.Sqrt(x_4t); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::initializer_list fetch_list = {out1->id, out2->id}; @@ -384,7 +384,7 @@ TEST(TransposeCollapsing, FuseTwoHorizontalTranspose) { auto y_t2 = builder.Transpose(x, {0, 2, 1}); auto out = builder.Add(y_t1, y_t2); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::initializer_list fetch_list = {out->id}; @@ -426,7 +426,7 @@ TEST(TransposeCollapsing, FuseVerAndHorTranspose) { auto y_t3 = builder.Transpose(x, {1, 2, 0}); auto out = builder.Add(y_t2, y_t3); auto program = builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::initializer_list fetch_list = {out->id}; diff --git a/paddle/cinn/frontend/pass/transpose_folding_base.h b/paddle/cinn/frontend/pass/transpose_folding_base.h index 4acc8e4f6d1f67..1703505b94a27d 100644 --- a/paddle/cinn/frontend/pass/transpose_folding_base.h +++ b/paddle/cinn/frontend/pass/transpose_folding_base.h @@ -52,7 +52,7 @@ class TransposeFoldingBase : public ProgramPass { void ApplyImpl(Program* program, const std::unordered_set& fetch_ids, - const common::Target& target) override { + const cinn::common::Target& target) override { set_target_instrs(); set_fold_instrs(); set_skip_instrs(); diff --git a/paddle/cinn/frontend/pass/transpose_folding_input_test.cc b/paddle/cinn/frontend/pass/transpose_folding_input_test.cc index b8befcc609c830..708572cefd5f11 100644 --- a/paddle/cinn/frontend/pass/transpose_folding_input_test.cc +++ b/paddle/cinn/frontend/pass/transpose_folding_input_test.cc @@ -56,7 +56,7 @@ TEST(TransposeFoldingInput, FoldIntoDotBatchedCase1) { auto out = builder.Matmul(transpose_x, y); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -81,7 +81,7 @@ TEST(TransposeFoldingInput, FoldIntoDotBachedCase2) { auto out = builder.Matmul(x, transpose_y); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -107,7 +107,7 @@ TEST(TransposeFoldingInput, FoldIntoDotBachedCase3) { auto out = builder.Matmul(transpose_x, transpose_y); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -132,7 +132,7 @@ TEST(TransposeFoldingInput, FoldIntoDotCase1) { auto out = builder.Matmul(x, transpose_y); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector 
input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -162,7 +162,7 @@ TEST(TransposeFoldingInput, FoldIntoDotCase2) { auto out = builder.Add(d, q); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{c.id(), z.id()}, std::back_inserter(input_ids), @@ -187,7 +187,7 @@ TEST(TransposeFoldingInput, TransposeOutInFetchIds) { auto out = builder.Matmul(x, transpose_y); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -220,7 +220,7 @@ TEST(TransposeFoldingInput, TransposeOutUsedByOtherInstrs) { auto out = builder.Add(transpose_y, dot); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -249,7 +249,7 @@ TEST(TransposeFoldingInput, TransposeTwiceWithMatmul) { auto dot2 = builder.Matmul(z, x_t_t); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id(), z.id()}, std::back_inserter(input_ids), @@ -277,7 +277,7 @@ TEST(TransposeFoldingInput, TransposeWithMultiMamtul) { auto out = builder.Add(dot1, dot2); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), diff --git a/paddle/cinn/frontend/pass/transpose_folding_output_test.cc b/paddle/cinn/frontend/pass/transpose_folding_output_test.cc index 4004acbd8d0ea4..5a5e013b37fde0 100755 --- a/paddle/cinn/frontend/pass/transpose_folding_output_test.cc +++ b/paddle/cinn/frontend/pass/transpose_folding_output_test.cc @@ -43,7 +43,7 @@ TEST(TransposeFoldingOutput, BatchedMatmulTransLeft) { auto out = builder.Subtract(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -71,7 +71,7 @@ TEST(TransposeFoldingOutput, BatchedGemmTransLeft) { auto out = builder.Add(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -99,7 +99,7 @@ TEST(TransposeFoldingOutput, BatchedMatmulTransRight) { auto out = builder.Subtract(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), b.id(), f.id()}, std::back_inserter(input_ids), @@ -127,7 +127,7 @@ TEST(TransposeFoldingOutput, BatchedGemmTransRight) { auto out = builder.Add(e, f); auto program = builder.Build(); - common::Target target = 
common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), b.id(), f.id()}, std::back_inserter(input_ids), @@ -156,7 +156,7 @@ TEST(TransposeFoldingOutput, BatchedMatmulTransTwo) { auto out = builder.Subtract(f, g); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -185,7 +185,7 @@ TEST(TransposeFoldingOutput, BatchedGemmTransTwo) { auto out = builder.Add(f, g); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -212,7 +212,7 @@ TEST(TransposeFoldingOutput, BatchedMatmulNoTrans) { auto out = builder.Subtract(f, g); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -239,7 +239,7 @@ TEST(TransposeFoldingOutput, BatchedGemmNoTrans) { auto out = builder.Add(f, g); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -267,7 +267,7 @@ TEST(TransposeFoldingOutput, MatmulTransLeft) { auto out = builder.Subtract(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -295,7 +295,7 @@ TEST(TransposeFoldingOutput, GemmTransLeft) { auto out = builder.Add(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -323,7 +323,7 @@ TEST(TransposeFoldingOutput, MatmulTransRight) { auto out = builder.Subtract(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), b.id(), f.id()}, std::back_inserter(input_ids), @@ -351,7 +351,7 @@ TEST(TransposeFoldingOutput, GemmTransRight) { auto out = builder.Add(e, f); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), b.id(), f.id()}, std::back_inserter(input_ids), @@ -380,7 +380,7 @@ TEST(TransposeFoldingOutput, MatmulTransTwo) { auto out = builder.Subtract(f, g); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -409,7 +409,7 @@ 
TEST(TransposeFoldingOutput, GemmTransTwo) { auto out = builder.Add(f, g); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -436,7 +436,7 @@ TEST(TransposeFoldingOutput, MatmulNoTrans) { auto out = builder.Subtract(f, g); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -463,7 +463,7 @@ TEST(TransposeFoldingOutput, GemmNoTrans) { auto out = builder.Add(f, g); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), c.id(), f.id()}, std::back_inserter(input_ids), @@ -505,7 +505,7 @@ TEST(TransposeFoldingOutput, BatchedComplex) { auto out = builder.Add(i, j); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{d.id(), z.id()}, std::back_inserter(input_ids), @@ -544,7 +544,7 @@ TEST(TransposeFoldingOutput, Complex) { auto out = builder.Add(i, j); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{c.id(), z.id()}, std::back_inserter(input_ids), @@ -575,7 +575,7 @@ TEST(TransposeFoldingOutput, MultiTransCaseOne) { auto out = builder.Add(h, g); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform( std::vector{a.id(), b.id(), e.id(), h.id()}, @@ -606,7 +606,7 @@ TEST(TransposeFoldingOutput, MultiTransCaseTwo) { auto out = builder.Add(h, g); auto program = builder.Build(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); std::vector input_ids; absl::c_transform(std::vector{a.id(), b.id(), h.id()}, std::back_inserter(input_ids), diff --git a/paddle/cinn/frontend/pass/transpose_scale_folding_test.cc b/paddle/cinn/frontend/pass/transpose_scale_folding_test.cc index 296ba7fba96a89..5d18b6ff8cbe22 100644 --- a/paddle/cinn/frontend/pass/transpose_scale_folding_test.cc +++ b/paddle/cinn/frontend/pass/transpose_scale_folding_test.cc @@ -40,7 +40,7 @@ TEST(ScaleFolding, FoldIntoDotBatchedCase1) { auto out = builder.Matmul(scale_x, y); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -65,7 +65,7 @@ TEST(ScaleFolding, FoldIntoDotBatchedCase2) { auto out = builder.Matmul(scale_x, y); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -90,7 +90,7 @@ TEST(ScaleFolding, FoldIntoDotBatchedCase3) { auto out = 
builder.Matmul(scale_x, y); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -115,7 +115,7 @@ TEST(ScaleFolding, FoldIntoDotBatchedCase4) { auto out = builder.Matmul(x, scale_y); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -141,7 +141,7 @@ TEST(ScaleFolding, FoldIntoDotBatchedCase5) { auto out = builder.Matmul(scale_x, scale_y); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -168,7 +168,7 @@ TEST(ScaleFolding, FoldIntoDotBatchedCase6) { auto out = builder.Scale(orig_out, 2.0f); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -198,7 +198,7 @@ TEST(TransposeScaleFolding, BatchComplexCase1) { auto out = builder.Transpose(scale_out, {0, 2, 1}); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -228,7 +228,7 @@ TEST(TransposeScaleFolding, BatchComplexCase2) { auto out = builder.Scale(transpose_out, 2.0f); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -254,7 +254,7 @@ TEST(TransposeScaleFolding, BatchComplexCase3) { auto out = builder.Matmul(transpose_x, scale_y); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -279,7 +279,7 @@ TEST(TransposeScaleFolding, BatchComplexCase4) { auto out = builder.Matmul(transpose_x, scale_x); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id()}, std::back_inserter(input_ids), @@ -308,7 +308,7 @@ TEST(TransposeScaleFolding, BatchComplexCase5) { auto out = builder.Matmul(transpose_o, z); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -335,7 +335,7 @@ TEST(TransposeScaleFolding, BatchComplexCase6) { auto out = builder.Transpose(out_matmul, {0, 2, 1}); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id()}, std::back_inserter(input_ids), @@ -364,7 +364,7 @@ 
TEST(TransposeBroadCastFolding, BatchComplexCase1) { auto out = builder.Scale(out_trans, 2.0f); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), @@ -395,7 +395,7 @@ TEST(TransposeBroadCastFolding, BatchComplexCase2) { auto out = builder.Scale(out_trans, 2.0f); auto program = builder.Build(); - common::Target target = common::DefaultTarget(); + cinn::common::Target target = cinn::common::DefaultTarget(); std::vector input_ids; absl::c_transform(std::vector{x.id(), y.id()}, std::back_inserter(input_ids), diff --git a/paddle/cinn/frontend/program_pass.cc b/paddle/cinn/frontend/program_pass.cc index 1cd0903f97a031..3e60dbddb899a2 100644 --- a/paddle/cinn/frontend/program_pass.cc +++ b/paddle/cinn/frontend/program_pass.cc @@ -23,7 +23,7 @@ namespace frontend { void ProgramPass::Apply(Program* prog, const std::unordered_set& fetch_ids, - const common::Target& target, + const cinn::common::Target& target, const std::vector& passes) { std::vector fpass; for (auto& name : passes) { diff --git a/paddle/cinn/frontend/program_pass.h b/paddle/cinn/frontend/program_pass.h index ecdb23ef2e1f07..1572b3ed5c7ca2 100755 --- a/paddle/cinn/frontend/program_pass.h +++ b/paddle/cinn/frontend/program_pass.h @@ -38,7 +38,7 @@ class ProgramPass { */ static void Apply(Program* prog, const std::unordered_set& fetch_ids, - const common::Target& target, + const cinn::common::Target& target, const std::vector& passes); const std::string& name() const { return name_; } @@ -46,10 +46,10 @@ class ProgramPass { protected: virtual void ApplyImpl(Program* prog, const std::unordered_set& fetch_ids, - const common::Target& target) {} + const cinn::common::Target& target) {} virtual void ApplyImpl(Program* prog, const std::unordered_set& fetch_ids, - const common::Target& target) const { + const cinn::common::Target& target) const { return const_cast(this)->ApplyImpl(prog, fetch_ids, target); } diff --git a/paddle/cinn/frontend/syntax.cc b/paddle/cinn/frontend/syntax.cc index 9b534d8d146c01..8f9d9a44b13008 100644 --- a/paddle/cinn/frontend/syntax.cc +++ b/paddle/cinn/frontend/syntax.cc @@ -44,7 +44,8 @@ void Instruction::PrepareOutputs() { Instruction::Instruction(absl::string_view op_type, const std::vector& inputs, Program* parent) - : common::Shared<_Instruction_>(common::make_shared<_Instruction_>()) { + : cinn::common::Shared<_Instruction_>( + cinn::common::make_shared<_Instruction_>()) { get()->op_type = std::string(op_type); get()->parent_program = parent; get()->inputs = inputs; @@ -173,7 +174,7 @@ Variable Program::fused_meta_batchnorm_inference( epsilon = absl::get(attr_store.at("epsilon")); } auto eps_var = - primitive_const_scalar(epsilon, common::UniqName("epsilon")); + primitive_const_scalar(epsilon, cinn::common::UniqName("epsilon")); CHECK(!scale->shape.empty()) << "scale's shape is empty."; auto broadcast_eps = primitive_broadcast_to(eps_var, scale->shape, {0}); auto var_add_eps = add(variance, broadcast_eps); @@ -203,7 +204,7 @@ Variable Program::fused_batchnorm_inference( epsilon = absl::get(attr_store.at("epsilon")); } auto eps_var = - primitive_const_scalar(epsilon, common::UniqName("epsilon")); + primitive_const_scalar(epsilon, cinn::common::UniqName("epsilon")); CHECK(!scale->shape.empty()) << "scale's shape is empty."; auto var_add_eps = elementwise_add(variance, eps_var); auto rsrqt_var = 
primitive_rsqrt(var_add_eps);
@@ -301,7 +302,7 @@ LoadPaddleProgram(const std::string& model_dir,
                   std::unordered_map<std::string, std::vector<int64_t>>&
                       input_shape_map,  // NOLINT
                   bool is_combined,
-                  const common::Target& target) {
+                  const cinn::common::Target& target) {
   VLOG(1) << "Loading Paddle model from " << model_dir;
   PaddleModelToProgram paddle_to_program(scope, input_shape_map, target);
   return std::make_tuple(paddle_to_program(model_dir, is_combined),
diff --git a/paddle/cinn/frontend/syntax.h b/paddle/cinn/frontend/syntax.h
index a405e22ddb565c..a2fa024be41313 100644
--- a/paddle/cinn/frontend/syntax.h
+++ b/paddle/cinn/frontend/syntax.h
@@ -41,9 +41,9 @@ namespace frontend {
 struct Program;
 struct Variable;
 
-struct _Variable_ : public common::Object {
+struct _Variable_ : public cinn::common::Object {
   std::string id;
-  common::Type type;
+  cinn::common::Type type;
   std::vector<int> shape;
   bool is_const = false;
 
@@ -54,17 +54,18 @@ struct _Variable_ : public common::Object {
 /**
  * Variable represents the variable in a computation.
  */
-struct Variable : public common::Shared<_Variable_> {
+struct Variable : public cinn::common::Shared<_Variable_> {
   /**
    * Constructor.
    * @param id_hint The identifier of the variable, if null, a random ID will be
    * assigned.
    */
   explicit Variable(const std::string& id_hint = "")
-      : common::Shared<_Variable_>(common::make_shared<_Variable_>()) {
+      : cinn::common::Shared<_Variable_>(
+            cinn::common::make_shared<_Variable_>()) {
     if (!id_hint.empty()) CheckVarNameValid(id_hint);
-    get()->id =
-        id_hint.empty() ? common::Context::Global().NewName("var") : id_hint;
+    get()->id = id_hint.empty() ? cinn::common::Context::Global().NewName("var")
+                                : id_hint;
   }
 
   void set_id(const std::string& id) { operator->()->id = id; }
@@ -85,13 +86,14 @@ class Placeholder {
    * @param shape Shape of the fed
    * @param id ID of the fed
    */
-  Placeholder(const common::Type& type,
+  Placeholder(const cinn::common::Type& type,
               const std::vector<int>& shape,
               absl::string_view id_hint = "",
               bool is_const = false) {
     if (!id_hint.empty()) CheckVarNameValid(std::string(id_hint));
-    id_ = id_hint.empty() ? common::Context::Global().NewName("placeholder")
-                          : (std::string)id_hint;
+    id_ = id_hint.empty()
+              ? cinn::common::Context::Global().NewName("placeholder")
+              : (std::string)id_hint;
     var_ = Variable(id_);
     var_->shape = shape;
     var_->type = type;
@@ -124,7 +126,7 @@ class Placeholder {
 /**
  * Data of a Instruction.
  */
-struct _Instruction_ : public common::Object {
+struct _Instruction_ : public cinn::common::Object {
   using attr_t = hlir::framework::AttrType;
 
   std::string op_type;
@@ -145,7 +147,7 @@ struct _Instruction_ : public common::Object {
  * Instruction is the basic computational unit of a Program, similar to the
  * operator concept in a DNN platform.
  */
-struct Instruction : public common::Shared<_Instruction_> {
+struct Instruction : public cinn::common::Shared<_Instruction_> {
   explicit Instruction(absl::string_view op_type,
                        const std::vector<Variable>& inputs = {},
                        Program* parent = nullptr);
@@ -528,12 +530,13 @@ std::tuple<std::unique_ptr<Program>,
            absl::flat_hash_map<std::string, Variable>,
            absl::flat_hash_map<std::string, std::string>,
            absl::flat_hash_set<std::string>>
-LoadPaddleProgram(const std::string& model_dir,
-                  hlir::framework::Scope* scope,
-                  std::unordered_map<std::string, std::vector<int64_t>>&
-                      input_shape_map,  // NOLINT
-                  bool is_combined,
-                  const common::Target& target = common::DefaultHostTarget());
+LoadPaddleProgram(
+    const std::string& model_dir,
+    hlir::framework::Scope* scope,
+    std::unordered_map<std::string, std::vector<int64_t>>&
+        input_shape_map,  // NOLINT
+    bool is_combined,
+    const cinn::common::Target& target = cinn::common::DefaultHostTarget());
 
 std::ostream& operator<<(std::ostream& os, const Variable& x);
 std::ostream& operator<<(std::ostream& os, const Instruction& instr);
diff --git a/paddle/cinn/frontend/syntax_test.cc b/paddle/cinn/frontend/syntax_test.cc
index 1cc76ef2950619..1c38076a0b7851 100644
--- a/paddle/cinn/frontend/syntax_test.cc
+++ b/paddle/cinn/frontend/syntax_test.cc
@@ -63,7 +63,7 @@ TEST(syntax, basic) {
 
 TEST(syntax, program_execute_multi_elementwise_add) {
   auto program = CreateAddProgram();
-  Target target = common::DefaultTarget();
+  Target target = cinn::common::DefaultTarget();
   std::unordered_set<std::string> fetch_ids;
   auto graph = Optimize(&program, fetch_ids, target);
   // auto graph = std::make_shared<hlir::framework::Graph>(*program, target);
@@ -84,7 +84,7 @@ TEST(syntax, program_execute_multi_elementwise_add) {
 
 TEST(syntax, program_execute_multi_elementwise_add2) {
   auto program = CreateAddProgram();
-  Target target = common::DefaultTarget();
+  Target target = cinn::common::DefaultTarget();
   std::unordered_set<std::string> fetch_ids;
   auto graph = Optimize(&program, fetch_ids, target);
   LOG(INFO) << "graph:\n" << graph->Visualize();
@@ -119,7 +119,7 @@
       std::get<2>(programTuple);
 
   LOG(INFO) << "program:\n" << *program;
 
-  Target target = common::DefaultHostTarget();
+  Target target = cinn::common::DefaultHostTarget();
   std::unordered_set<std::string> fetch_ids;
   auto graph = cinn::frontend::Optimize(program.get(), fetch_ids, target);
diff --git a/paddle/cinn/frontend/var_type_utils.h b/paddle/cinn/frontend/var_type_utils.h
index b11c222da3f801..85a70ee4f53a99 100644
--- a/paddle/cinn/frontend/var_type_utils.h
+++ b/paddle/cinn/frontend/var_type_utils.h
@@ -24,10 +24,11 @@ namespace cinn {
 namespace frontend {
 namespace utils {
 
-inline common::Type CppVarType2CommonType(paddle::cpp::VarDescAPI::Type type) {
+inline cinn::common::Type CppVarType2CommonType(
+    paddle::cpp::VarDescAPI::Type type) {
 #define SET_TYPE_CASE_ITEM(v_type, c_type)    \
   case paddle::cpp::VarDescAPI::Type::v_type: \
-    return common::c_type();                  \
+    return cinn::common::c_type();            \
     break;
 
   static std::vector<std::string> var_type_names_ = {"BOOL",  // 0
@@ -87,7 +88,7 @@ inline common::Type CppVarType2CommonType(paddle::cpp::VarDescAPI::Type type) {
         << static_cast<int>(type) << ")";
   }
 #undef SET_DATA_TYPE_CASE_ITEM
-  return common::Type();
+  return cinn::common::Type();
 }
 
 inline OpMapperContext::FeedInfo GetFeedInfoFromDesc(
diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc
index 33c8bbe1b86240..68a09ad7a9868b 100644
--- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc
+++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc
@@ -16,9 +16,9 @@
 #include <vector>
 
 #include "glog/logging.h"
+#include "paddle/common/enforce.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/pir/core/builtin_type.h"
-#include "paddle/pir/core/enforce.h"
 #include "paddle/pir/core/op_base.h"
 #include "paddle/pir/dialect/control_flow/ir/cf_op.h"
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/add_broadcast_to_elementwise_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/add_broadcast_to_elementwise_pass.cc
index f8d95617c6cb4d..e36e3a3cc156c4 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/add_broadcast_to_elementwise_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/add_broadcast_to_elementwise_pass.cc
@@ -16,11 +16,11 @@
 
 #include "paddle/cinn/hlir/dialect/operator/ir/cinn_op.h"
 #include "paddle/cinn/hlir/framework/pir/utils.h"
+#include "paddle/common/ddim.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/fluid/pir/drr/api/match_context.h"
-#include "paddle/phi/core/ddim.h"
 #include "paddle/pir/core/builtin_dialect.h"
 #include "paddle/pir/pass/pass.h"
 #include "paddle/pir/pattern_rewrite/pattern_applicator.h"
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc
index 28eb1c0da8abcb..1c28039718a745 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc
@@ -193,7 +193,7 @@ class GraphGroupFuseHelper final : public FuseHelper {
         Visit(node_producer);
       }
     };
-    common::IsReachablePredicator<OpGroupPtr> is_reachable(
+    cinn::common::IsReachablePredicator<OpGroupPtr> is_reachable(
         MinDepth4Node, MaxDepth4Node, VisitNextNodes);
     return is_reachable(consumer, producer, [](OpGroupPtr) {});
   }
@@ -468,12 +468,12 @@ struct HorizontalFuseUtil {
     }
 
     size_t size_ele =
-        phi::product(GetMasterNode(*ele_group).outputs()[0].shape());
+        ::common::product(GetMasterNode(*ele_group).outputs()[0].shape());
 
     bool can_fuse = false;
     reduce_group->WalkOpNodes([&](const cinn::dialect::ir::OpNode& op) {
       if (op.kind() == OpPatternKind::kReduction) {
-        size_t size_master = phi::product(op.outputs()[0].shape());
+        size_t size_master = ::common::product(op.outputs()[0].shape());
         if (size_ele == size_master) {
           can_fuse = true;
         }
@@ -1880,13 +1880,13 @@ class GeneralFusionMergePassHelper {
         continue;
       }
 
-      auto producer_output_shape = phi::vectorize(
+      auto producer_output_shape = ::common::vectorize(
           GetValueShape((*producer->output_ops.begin())->result(0)));
 
-      auto consumer_output_shape = phi::vectorize(
+      auto consumer_output_shape = ::common::vectorize(
           GetValueShape((*consumer->output_ops.begin())->result(0)));
 
-      auto consumer_master_input_shape = phi::vectorize(GetValueShape(
+      auto consumer_master_input_shape = ::common::vectorize(GetValueShape(
           (*(consumer->master_ops.begin()))->operand_source(0)));
 
       int producer_output_numel =
@@ -1933,9 +1933,9 @@ class GeneralFusionMergePassHelper {
         continue;
       }
 
-      auto shape0 = phi::vectorize(
+      auto shape0 = ::common::vectorize(
           GetValueShape((*producer->output_ops.begin())->result(0)));
-      auto shape1 = phi::vectorize(
+      auto shape1 = ::common::vectorize(
           GetValueShape((*consumer->output_ops.begin())->result(0)));
 
       if (std::accumulate(
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass_utils.h b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass_utils.h
index
1b996676d449df..7e874ecb8e95a1 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass_utils.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass_utils.h @@ -47,7 +47,7 @@ static bool IsSameSize(const OpGroupPtr& src, const OpGroupPtr& dst) { auto size_0 = src_master_node.outputs()[0].shape(); auto size_1 = dst_master_node.outputs()[0].shape(); - return phi::product(size_0) == phi::product(size_1); + return ::common::product(size_0) == ::common::product(size_1); } static std::unordered_set GetInputOps( @@ -139,7 +139,8 @@ static int GetSharedSize(const cinn::dialect::ir::OpNode& op_node) { for (int idx = axes.back() + 1; idx < inshape.size(); ++idx) { lane = inshape[idx]; } - // int max_num_threads = common::DefaultNVGPUTarget().max_num_threads(); + // int max_num_threads = + // cinn::common::DefaultNVGPUTarget().max_num_threads(); int max_num_threads = 1000; if (lane > max_num_threads / 2) { return 0; diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h index 7754a9e0932d3a..d59f673d53f7ba 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h @@ -84,8 +84,8 @@ inline bool is_same_size(const std::shared_ptr& first, return true; } - auto size_0 = phi::product(output_var_0); - auto size_1 = phi::product(output_var_1); + auto size_0 = ::common::product(output_var_0); + auto size_1 = ::common::product(output_var_1); return size_0 == size_1; } @@ -145,11 +145,11 @@ inline bool honrizontal_elementwise_fuse_reduce( auto ele_node_shape = GetValueShape((*ele_group->master_ops.begin())->result(0)); - int32_t size_ele = phi::product(ele_node_shape); + int32_t size_ele = ::common::product(ele_node_shape); // TODO(phlrain): seems extrame danger herem, why compare multi Master Node? 
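// A minimal sketch (not part of the patch) of the helper swap the hunks above
// perform: after #59105, DDim and its free functions live in
// paddle/common/ddim.h under the top-level `common` namespace, written as
// `::common::` so it cannot be confused with `cinn::common`. The shapes below
// are made up for illustration.
#include "paddle/common/ddim.h"

// True when two shapes describe the same number of elements -- the condition
// the fusion passes above use when deciding whether two groups can be paired.
inline bool SameNumel(const ::common::DDim& a, const ::common::DDim& b) {
  // product() multiplies all extents, i.e. the element count of the shape.
  return ::common::product(a) == ::common::product(b);
}
// e.g. SameNumel(::common::make_ddim({2, 3, 4}), ::common::make_ddim({6, 4}))
// -> true: both hold 24 elements.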
for (auto* master : reduce_group->master_ops) { auto master_node_shape = GetValueShape(master->result(0)); - int32_t size_master = phi::product(master_node_shape); + int32_t size_master = ::common::product(master_node_shape); if (size_ele == size_master) { return true; } @@ -160,7 +160,7 @@ inline bool honrizontal_elementwise_fuse_reduce( inline bool elementwise_fuse_reduce(const std::shared_ptr& first, const std::shared_ptr& second) { - // if (helper->target_ == common::DefaultHostTarget()) { + // if (helper->target_ == cinn::common::DefaultHostTarget()) { // return true; // } // if same shape with horizontal relation @@ -205,11 +205,11 @@ inline bool elementwise_fuse_reduce(const std::shared_ptr& first, bool flag = true; auto first_node_shape = GetValueShape((*first->master_ops.begin())->result(0)); - int32_t size_first = phi::product(first_node_shape); + int32_t size_first = ::common::product(first_node_shape); for (::pir::Operation* master : masters_in_consumers) { auto second_node_shape = GetValueShape(master->result(0)); - int32_t size_second = phi::product(second_node_shape); + int32_t size_second = ::common::product(second_node_shape); if (size_first != size_second) { flag = false; break; @@ -301,10 +301,10 @@ inline bool broadcast_fuse_reduce(const std::shared_ptr& first, // CHECK(reducer) << "Can't find reduce op in group " << second->group_id; auto input_shape = GetValueShape(reducer->operand_source(0)); - auto input_size = phi::product(input_shape); + auto input_size = ::common::product(input_shape); auto output_shape = GetValueShape((*first->master_ops.begin())->result(0)); - auto output_size = phi::product(output_shape); + auto output_size = ::common::product(output_shape); if (input_size == output_size) { return elementwise_fuse_reduce(first, second); @@ -435,9 +435,9 @@ inline bool reduce_fuse_broadcast(const std::shared_ptr& first, // First type conditions // Get some reduce information auto reducer_input_shape = - phi::vectorize(GetValueShape(reducer->operand_source(0))); + ::common::vectorize(GetValueShape(reducer->operand_source(0))); auto reducer_output_shape = - phi::vectorize(GetValueShape(reducer->result(0))); + ::common::vectorize(GetValueShape(reducer->result(0))); std::vector reduce_axes = GetVectorAttr(reducer, "dim"); auto keep_dim = @@ -456,7 +456,7 @@ inline bool reduce_fuse_broadcast(const std::shared_ptr& first, reduce_size *= reducer_input_shape[idx - 1]; } // Check if the reduce size exceeds the hardware limit - // if (helper->target_ == common::DefaultNVGPUTarget() && + // if (helper->target_ == cinn::common::DefaultNVGPUTarget() && // reduce_size > helper->target_.max_num_threads()) { // return false; // } diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc index fffd2edc027b61..54005eb22f25b3 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc @@ -186,7 +186,7 @@ bool WithoutLastDimInReduce(const std::vector& inshape, } int GetSharedSize(::pir::Operation* op) { - auto inshape = phi::vectorize(GetValueShape(op->result(0))); + auto inshape = ::common::vectorize(GetValueShape(op->result(0))); auto axes = GetVectorAttr(op, "dim"); @@ -195,8 +195,9 @@ int GetSharedSize(::pir::Operation* op) { for (size_t idx = axes.back() + 1; idx < inshape.size(); ++idx) { lane = inshape[idx]; } - // int 
max_num_threads = common::DefaultNVGPUTarget().max_num_threads(); - // todo(phlrain): get gpu max threads + // int max_num_threads = + // cinn::common::DefaultNVGPUTarget().max_num_threads(); todo(phlrain): get + // gpu max threads int max_num_threads = 2048; if (lane > max_num_threads / 2) { return 0; diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h index ef8aa1fd2d565d..5169ef85198933 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h @@ -97,14 +97,14 @@ inline bool is_same_size(::pir::Operation* producer, if (producer_shape == consumer_shape) { return true; } - auto psize = phi::product(producer_shape); - auto csize = phi::product(consumer_shape); + auto psize = ::common::product(producer_shape); + auto csize = ::common::product(consumer_shape); return psize == csize; } inline bool without_last_dimension_in_reduce( ::pir::Operation* producer, const std::shared_ptr& consumer) { - auto in_shape = phi::vectorize(GetFirstInputShape(producer)); + auto in_shape = ::common::vectorize(GetFirstInputShape(producer)); auto reduce_axes = GetVectorAttr(producer, "dim"); return WithoutLastDimInReduce(in_shape, reduce_axes); } @@ -121,14 +121,14 @@ inline bool reduce_fuse_reduce(::pir::Operation* producer, } // check reduce has same input shape and output shape auto producer_input_shape = - phi::vectorize(GetValueShape(producer->operand_source(0))); + ::common::vectorize(GetValueShape(producer->operand_source(0))); auto producer_output_shape = - phi::vectorize(GetValueShape(producer->result(0))); + ::common::vectorize(GetValueShape(producer->result(0))); auto reducer_input_shape = - phi::vectorize(GetValueShape(reducer->operand_source(0))); + ::common::vectorize(GetValueShape(reducer->operand_source(0))); auto reducer_output_shape = - phi::vectorize(GetValueShape(reducer->result(0))); + ::common::vectorize(GetValueShape(reducer->result(0))); auto producer_reduce_dim = GetVectorAttr(producer, "dim"); auto reducer_reduce_dim = GetVectorAttr(reducer, "dim"); @@ -238,7 +238,7 @@ inline bool horizontal_or_vertical_reduce_relation( } // check producer has same shape with reducer op. 
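// Sketch of what the TODO above leaves open (a hypothetical helper, not part
// of the patch): the hard-coded `int max_num_threads = 2048;` stands in for
// querying the active target, as the commented-out line names.
#include "paddle/cinn/common/target.h"

inline int MaxNumThreadsForSchedule() {
#ifdef CINN_WITH_CUDA
  // max_num_threads() reports the per-block thread limit of the default
  // NVGPU target, which is what the shared-memory heuristic actually needs.
  return cinn::common::DefaultNVGPUTarget().max_num_threads();
#else
  return 2048;  // mirrors the placeholder the pass uses on non-GPU builds
#endif
}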
- auto reduce_shape = phi::vectorize(GetFirstInputShape(reducer)); + auto reduce_shape = ::common::vectorize(GetFirstInputShape(reducer)); auto reduce_axes = GetVectorAttr(reducer, "dim"); for (auto& axis : reduce_axes) { @@ -248,8 +248,9 @@ inline bool horizontal_or_vertical_reduce_relation( } } - auto op_shape = phi::vectorize(GetValueShape(producer->result(0))); - // auto op_shape = phi::vectorize(GetFirstInputShape(producer)); + auto op_shape = + ::common::vectorize(GetValueShape(producer->result(0))); + // auto op_shape = ::common::vectorize(GetFirstInputShape(producer)); auto op_size = std::accumulate( op_shape.begin(), op_shape.end(), 1, std::multiplies()); auto reduce_size = std::accumulate( @@ -273,7 +274,7 @@ inline bool horizontal_or_vertical_reduce_relation( break; } - // helper->target_ == common::DefaultNVGPUTarget() + // helper->target_ == cinn::common::DefaultNVGPUTarget() // succesive_reduce_dimension <= helper->target_.max_num_threads() // TODO(phlrain): support is_gpu_target and max_thread bool is_gpu_target = true; @@ -330,11 +331,12 @@ inline bool reduce_fuse_broadcast(::pir::Operation* producer, return false; } - // if (helper->target_ != common::DefaultNVGPUTarget()) { + // if (helper->target_ != cinn::common::DefaultNVGPUTarget()) { // return true; // } - auto rinput_shape = phi::vectorize(GetFirstInputShape(producer)); + auto rinput_shape = + ::common::vectorize(GetFirstInputShape(producer)); auto reduce_axes = GetVectorAttr(producer, "dim"); auto keep_dim = producer->attributes() .at("keep_dim") @@ -359,7 +361,7 @@ inline bool reduce_fuse_broadcast(::pir::Operation* producer, // } auto routput_shape = - phi::vectorize(GetValueShape(producer->result(0))); + ::common::vectorize(GetValueShape(producer->result(0))); auto find_reducer = [&](::pir::Operation* op, ::pir::Operation* reducer, diff --git a/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc b/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc index cb996d9c8bd74f..3d6b82dff36651 100644 --- a/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc +++ b/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc @@ -16,8 +16,8 @@ #include "paddle/cinn/hlir/dialect/operator/ir/op_attribute.h" #include "paddle/cinn/hlir/framework/pir_compiler.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/builtin_attribute.h" -#include "paddle/pir/core/enforce.h" namespace cinn { namespace dialect { diff --git a/paddle/cinn/hlir/framework/accuracy_checker.cc b/paddle/cinn/hlir/framework/accuracy_checker.cc index 95b5b4102414f5..82777b89ba52ad 100644 --- a/paddle/cinn/hlir/framework/accuracy_checker.cc +++ b/paddle/cinn/hlir/framework/accuracy_checker.cc @@ -237,7 +237,7 @@ std::string AccuracyChecker::CheckTensor(const Tensor& tensor, const std::string& arg_name) { Tensor cpu_tensor; cpu_tensor->Resize(tensor->shape()); - T* dst = cpu_tensor->mutable_data(common::DefaultHostTarget()); + T* dst = cpu_tensor->mutable_data(cinn::common::DefaultHostTarget()); const T* src = tensor->data(); size_t numel = tensor->shape().numel(); @@ -259,7 +259,7 @@ std::string AccuracyChecker::CheckBuffer(const cinn_buffer_t* buffer, Tensor cpu_tensor; cpu_tensor->Resize(Shape(shape)); - T* dst = cpu_tensor->mutable_data(common::DefaultHostTarget()); + T* dst = cpu_tensor->mutable_data(cinn::common::DefaultHostTarget()); const T* src = reinterpret_cast(buffer->memory); size_t numel = cpu_tensor->shape().numel(); @@ -273,12 +273,12 @@ std::string AccuracyChecker::CheckBuffer(const cinn_buffer_t* buffer, template void 
AccuracyChecker::MemcpyDeviceToHost(const T* src, size_t numel, T* dst) { #ifdef CINN_WITH_CUDA - if (target_ == common::DefaultNVGPUTarget()) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { cudaMemcpy(dst, src, numel * sizeof(T), cudaMemcpyDeviceToHost); return; } #endif - if (target_ == common::DefaultHostTarget()) { + if (target_ == cinn::common::DefaultHostTarget()) { for (size_t i = 0; i < numel; ++i) { dst[i] = src[i]; } diff --git a/paddle/cinn/hlir/framework/accuracy_checker_test.cc b/paddle/cinn/hlir/framework/accuracy_checker_test.cc index f5070101f8eb79..05efb7bd925c6f 100644 --- a/paddle/cinn/hlir/framework/accuracy_checker_test.cc +++ b/paddle/cinn/hlir/framework/accuracy_checker_test.cc @@ -49,20 +49,20 @@ void SetRandomTensor(Tensor tensor, Target target, bool generate_nan) { GenerateRandomData(random_nan_vec.data(), numel, generate_nan); #ifdef CINN_WITH_CUDA - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { cudaMemcpy(dst, random_nan_vec.data(), numel * sizeof(float), cudaMemcpyHostToDevice); } #endif - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { std::copy(random_nan_vec.begin(), random_nan_vec.end(), dst); } } TEST(AccuracyChecker, tensor) { - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); Scope scope; scope.Var("x"); auto out = scope.GetTensor("x"); @@ -106,7 +106,7 @@ void InstantiateScope(Scope* scope, Target target) { } TEST(AccuracyChecker, instruction) { - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Scope scope; InstantiateScope(&scope, target); @@ -148,7 +148,7 @@ void InitName2PodArgs(Target target, } TEST(AccuracyChecker, instruction_podargs) { - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); std::vector args_buffer(2); std::map name2podargs; InitName2PodArgs(target, &args_buffer, &name2podargs); diff --git a/paddle/cinn/hlir/framework/buffer.cc b/paddle/cinn/hlir/framework/buffer.cc index 83427abe9cbe7d..524433c165a5cf 100755 --- a/paddle/cinn/hlir/framework/buffer.cc +++ b/paddle/cinn/hlir/framework/buffer.cc @@ -44,7 +44,7 @@ void Buffer::Resize(uint32_t alignment, uint32_t size) { } } -void Buffer::SetTarget(const common::Target& target) { +void Buffer::SetTarget(const cinn::common::Target& target) { target_ = target; memory_mng_cache_ = MemoryManager::Global().RetrieveSafely(target_.arch); } @@ -59,7 +59,7 @@ void Buffer::ResizeLazy(uint32_t alignment, uint32_t size) { Resize(alignment, size); } -void Buffer::Resize(uint32_t size, const common::Target& target) { +void Buffer::Resize(uint32_t size, const cinn::common::Target& target) { if (target.arch != target_.arch) { Free(); SetTarget(target); @@ -69,7 +69,7 @@ void Buffer::Resize(uint32_t size, const common::Target& target) { void Buffer::Resize(uint32_t alignment, uint32_t size, - const common::Target& target) { + const cinn::common::Target& target) { if (target.arch != target_.arch) { Free(); SetTarget(target); @@ -77,7 +77,7 @@ void Buffer::Resize(uint32_t alignment, Resize(alignment, size); } -void Buffer::ResizeLazy(uint32_t size, const common::Target& target) { +void Buffer::ResizeLazy(uint32_t size, const cinn::common::Target& target) { if (target.arch != target_.arch) { Free(); SetTarget(target); @@ -87,7 +87,7 @@ void Buffer::ResizeLazy(uint32_t size, const common::Target& target) { void Buffer::ResizeLazy(uint32_t alignment, uint32_t 
size, - const common::Target& target) { + const cinn::common::Target& target) { if (target.arch != target_.arch) { Free(); SetTarget(target); diff --git a/paddle/cinn/hlir/framework/buffer.h b/paddle/cinn/hlir/framework/buffer.h index 4d5e7cb0afbeaf..78d832c5493b4d 100644 --- a/paddle/cinn/hlir/framework/buffer.h +++ b/paddle/cinn/hlir/framework/buffer.h @@ -34,7 +34,7 @@ namespace framework { */ struct Buffer final { Buffer() = default; - explicit Buffer(const common::Target& target) { SetTarget(target); } + explicit Buffer(const cinn::common::Target& target) { SetTarget(target); } ~Buffer() { Free(); } //! Resize the memory hold by this buffer *exactlly* to \p size. void Resize(uint32_t size); @@ -45,16 +45,18 @@ struct Buffer final { void ResizeLazy(uint32_t alignment, uint32_t size); //! Resize the memory to \p size in target \p target. - void Resize(uint32_t size, const common::Target& target); - void Resize(uint32_t alignment, uint32_t size, const common::Target& target); + void Resize(uint32_t size, const cinn::common::Target& target); + void Resize(uint32_t alignment, + uint32_t size, + const cinn::common::Target& target); //! Lazily resize the memory to \p size in target \p target. - void ResizeLazy(uint32_t size, const common::Target& target); + void ResizeLazy(uint32_t size, const cinn::common::Target& target); void ResizeLazy(uint32_t alignment, uint32_t size, - const common::Target& target); + const cinn::common::Target& target); - void SetTarget(const common::Target& target); + void SetTarget(const cinn::common::Target& target); const cinn_buffer_t* data() const { return &data_; } cinn_buffer_t* data() { return &data_; } @@ -81,7 +83,7 @@ struct Buffer final { cinn_buffer_t data_; //! The place where this buffer locates. - common::Target target_; + cinn::common::Target target_; //! Number of bytes of this buffer. 
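// Usage sketch for the Buffer interface above, assuming only what the
// buffer.h/buffer.cc hunks show: the target-taking Resize/ResizeLazy overloads
// first compare architectures, and on a mismatch free the old allocation and
// rebind the buffer to the new target's memory manager before resizing.
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/hlir/framework/buffer.h"

void BufferResizeDemo() {
  using cinn::hlir::framework::Buffer;
  Buffer buffer(cinn::common::DefaultHostTarget());
  buffer.Resize(16 * sizeof(float));
  auto* data = reinterpret_cast<float*>(buffer.data()->memory);
  for (int i = 0; i < 16; ++i) data[i] = static_cast<float>(i);
  // Same arch: plain lazy grow. A different arch (e.g. NVGPU) would trigger
  // Free() + SetTarget() before the resize, per buffer.cc above.
  buffer.ResizeLazy(32 * sizeof(float), cinn::common::DefaultHostTarget());
}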
uint32_t size_{}; diff --git a/paddle/cinn/hlir/framework/buffer_test.cc b/paddle/cinn/hlir/framework/buffer_test.cc index 6687077b263098..2fa644321b29b5 100755 --- a/paddle/cinn/hlir/framework/buffer_test.cc +++ b/paddle/cinn/hlir/framework/buffer_test.cc @@ -25,7 +25,7 @@ namespace hlir { namespace framework { TEST(Buffer, basic) { - Buffer buffer(common::DefaultHostTarget()); + Buffer buffer(cinn::common::DefaultHostTarget()); buffer.Resize(10 * sizeof(float)); auto* data = reinterpret_cast(buffer.data()->memory); for (int i = 0; i < 10; i++) data[i] = i; @@ -34,7 +34,7 @@ TEST(Buffer, basic) { #ifdef CINN_WITH_CUDA TEST(Buffer, nvgpu) { const int num_elements = 10; - Buffer buffer(common::DefaultNVGPUTarget()); + Buffer buffer(cinn::common::DefaultNVGPUTarget()); buffer.Resize(num_elements * sizeof(float)); auto* data = reinterpret_cast(buffer.data()->memory); std::vector host_data(num_elements); diff --git a/paddle/cinn/hlir/framework/graph.cc b/paddle/cinn/hlir/framework/graph.cc index faf8cd9f38c7e6..c94f150e3937c5 100644 --- a/paddle/cinn/hlir/framework/graph.cc +++ b/paddle/cinn/hlir/framework/graph.cc @@ -31,7 +31,7 @@ namespace cinn { namespace hlir { namespace framework { -using DTypeDict = absl::flat_hash_map; +using DTypeDict = absl::flat_hash_map; using ShapeDict = absl::flat_hash_map; void Graph::Initialize(const frontend::Program& prog, @@ -51,7 +51,7 @@ void Graph::Initialize(const frontend::Program& prog, Shared node_ptr(node_tmp); node_tmp->attrs.attr_store = temp->attrs; for (auto& input_v : temp->inputs) { - common::GraphNode* graph_node = this->RetrieveNode(input_v->id); + cinn::common::GraphNode* graph_node = this->RetrieveNode(input_v->id); if (!graph_node) { dtype_dict[input_v->id] = input_v->type; shape_dict[input_v->id] = input_v->shape; @@ -65,7 +65,7 @@ void Graph::Initialize(const frontend::Program& prog, } int out_idx = 0; for (auto& output_v : temp->outputs) { - common::GraphNode* graph_node = this->RetrieveNode(output_v->id); + cinn::common::GraphNode* graph_node = this->RetrieveNode(output_v->id); if (!graph_node) { dtype_dict[output_v->id] = output_v->type; shape_dict[output_v->id] = output_v->shape; @@ -92,10 +92,11 @@ std::vector> Graph::FusionGroupsToGroups() { std::vector> groups; if (fusion_groups.empty()) { // if no fusion_groups, the graph will be treated as a big group - const auto& nodes = this->CollectNodes([](const common::GraphNode* node) { - return node->safe_as() != nullptr && - node->safe_as()->op() != nullptr; - }); + const auto& nodes = + this->CollectNodes([](const cinn::common::GraphNode* node) { + return node->safe_as() != nullptr && + node->safe_as()->op() != nullptr; + }); std::vector group; group.reserve(nodes.size()); for (auto* node : nodes) { @@ -200,8 +201,9 @@ std::string Graph::DebugGroupedGraph( const auto& shape = shape_dict.count(id) ? cinn::utils::Join(shape_dict.at(id), ", ") : "-1"; - const auto& dtype = - dtype_dict.count(id) ? common::Type2Str(dtype_dict.at(id)) : "float32"; + const auto& dtype = dtype_dict.count(id) + ? 
cinn::common::Type2Str(dtype_dict.at(id)) + : "float32"; // generator python create_input code debug_str << " " << id << " = builder.create_input(type=\"" << dtype diff --git a/paddle/cinn/hlir/framework/graph.h b/paddle/cinn/hlir/framework/graph.h index 4c014043a1e248..d6ef914f0846ec 100644 --- a/paddle/cinn/hlir/framework/graph.h +++ b/paddle/cinn/hlir/framework/graph.h @@ -198,16 +198,20 @@ class Graph : public cinn::common::Graph { std::vector> fusion_groups; void RegisterNode(size_t key, Node* node) { - this->common::Graph::RegisterNode(key, node->as()); + this->cinn::common::Graph::RegisterNode( + key, node->as()); } void RegisterNode(size_t key, NodeData* node) { - this->common::Graph::RegisterNode(key, node->as()); + this->cinn::common::Graph::RegisterNode( + key, node->as()); } void RegisterNode(const std::string& key, Node* node) { - this->common::Graph::RegisterNode(key, node->as()); + this->cinn::common::Graph::RegisterNode( + key, node->as()); } void RegisterNode(const std::string& key, NodeData* node) { - this->common::Graph::RegisterNode(key, node->as()); + this->cinn::common::Graph::RegisterNode( + key, node->as()); } /** diff --git a/paddle/cinn/hlir/framework/graph_compiler.cc b/paddle/cinn/hlir/framework/graph_compiler.cc index acd4387efb7121..ffa599805f13ef 100644 --- a/paddle/cinn/hlir/framework/graph_compiler.cc +++ b/paddle/cinn/hlir/framework/graph_compiler.cc @@ -277,7 +277,7 @@ void GraphCompiler::InsertBufferHandlers( const auto& malloc_var_names = m_it->second; auto function_name = "malloc_buffer_instruction_" + std::to_string(step); auto malloc_instr = - std::make_unique(common::DefaultHostTarget(), + std::make_unique(cinn::common::DefaultHostTarget(), context->scope.get(), malloc_var_names, std::vector({}), @@ -300,7 +300,7 @@ void GraphCompiler::InsertBufferHandlers( const auto& free_var_names = f_it->second; auto function_name = "free_buffer_instruction_" + std::to_string(step); auto free_instr = - std::make_unique(common::DefaultHostTarget(), + std::make_unique(cinn::common::DefaultHostTarget(), context->scope.get(), std::vector({}), free_var_names, @@ -350,7 +350,7 @@ std::shared_ptr BuildScope(Target target, std::vector GetFuncFromImpl( const std::shared_ptr& impl, - const common::CINNValuePack& cinn_inputs, + const cinn::common::CINNValuePack& cinn_inputs, std::vector& all_arg_tensors, // NOLINT const std::vector& input_output_nodes, const std::string& node_id, @@ -359,7 +359,7 @@ std::vector GetFuncFromImpl( utils::EventType::kOrdinary); // 1.Call Op's Compute function, using the default stages and LowerVec to get // IR tree. - common::CINNValuePack C = impl->fcompute(cinn_inputs); + cinn::common::CINNValuePack C = impl->fcompute(cinn_inputs); // 2. Collect tensors and arguments // Add output tensors to all_arg_tensors @@ -367,7 +367,7 @@ std::vector GetFuncFromImpl( ir::Expr temp = C[i]; // checkout whether the tensor is with buffer. 
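// Why the RegisterNode hunks above spell out
// this->cinn::common::Graph::RegisterNode(...): a derived class's overloads
// hide every same-named base-class overload, so delegating to the base needs
// explicit qualification, and after this patch the qualification carries the
// full cinn::common path. Standalone illustration with toy types (not the
// real Graph/Node classes):
#include <cstdio>

namespace base_ns {
struct GraphNode {};
struct Graph {
  void RegisterNode(size_t key, GraphNode* node) {
    (void)node;
    std::printf("registered %zu\n", key);
  }
};
}  // namespace base_ns

struct Node : base_ns::GraphNode {};

struct Graph : base_ns::Graph {
  // Hides base_ns::Graph::RegisterNode; an unqualified call here would
  // resolve to this overload and recurse instead of reaching the base.
  void RegisterNode(size_t key, Node* node) {
    this->base_ns::Graph::RegisterNode(key, node);
  }
};

int main() {
  Graph g;
  Node n;
  g.RegisterNode(7, &n);  // prints "registered 7"
  return 0;
}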
if (!temp.as_tensor_ref()->buffer.defined() || - target != common::DefaultNVGPUTarget()) { + target != cinn::common::DefaultNVGPUTarget()) { all_arg_tensors.push_back(temp.as_tensor_ref()); } } @@ -386,18 +386,18 @@ std::vector GetFuncFromImpl( VLOG(4) << fun; } - std::vector schedule_inputs; + std::vector schedule_inputs; for (int i = 0; i < C.size() - 1; ++i) { CHECK(C[i].is_tensor()); - schedule_inputs.push_back(common::CINNValue(C[i])); + schedule_inputs.push_back(cinn::common::CINNValue(C[i])); } for (auto& f : funcs) { - schedule_inputs.push_back(common::CINNValue(f->body)); + schedule_inputs.push_back(cinn::common::CINNValue(f->body)); } // 3. Call Op's Schedule function, optimizing the IR tree by new IR schedule - common::CINNValuePack expr_pack = - impl->fschedule(common::CINNValuePack{schedule_inputs}); + cinn::common::CINNValuePack expr_pack = + impl->fschedule(cinn::common::CINNValuePack{schedule_inputs}); // 4. Optimize the LoweredFunc VLOG(3) << "expr_pack.size() is : " << expr_pack.size() diff --git a/paddle/cinn/hlir/framework/graph_compiler.h b/paddle/cinn/hlir/framework/graph_compiler.h index ddbc29b504efcf..d972fc856c825d 100644 --- a/paddle/cinn/hlir/framework/graph_compiler.h +++ b/paddle/cinn/hlir/framework/graph_compiler.h @@ -115,7 +115,7 @@ std::shared_ptr BuildScope(Target target, // Given params, lower the op to LoweredFunc using new IR Schedule std::vector GetFuncFromImpl( const std::shared_ptr& impl, - const common::CINNValuePack& cinn_inputs, + const cinn::common::CINNValuePack& cinn_inputs, std::vector& tensor_inputs, // NOLINT const std::vector& input_output_nodes, const std::string& node_id, diff --git a/paddle/cinn/hlir/framework/graph_compiler_test.cc b/paddle/cinn/hlir/framework/graph_compiler_test.cc index e9ee1a21a4edf8..49f959eb90174b 100644 --- a/paddle/cinn/hlir/framework/graph_compiler_test.cc +++ b/paddle/cinn/hlir/framework/graph_compiler_test.cc @@ -30,7 +30,7 @@ namespace cinn { namespace hlir { namespace framework { -using common::Float; +using cinn::common::Float; using frontend::Placeholder; TEST(GraphCompilerTest, TestRemoveInvaildVariables) { @@ -41,7 +41,7 @@ TEST(GraphCompilerTest, TestRemoveInvaildVariables) { auto c = builder.Add(a, b, 1); auto d = builder.Relu(c); - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); auto program = builder.Build(); auto graph = Optimize(&program, {}, target); @@ -66,7 +66,7 @@ TEST(GraphCompilerTest, TestInsertBufferHandlers) { auto c = builder.Add(a, b, 1); auto d = builder.Relu(c); - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); auto program = builder.Build(); auto graph = Optimize(&program, {}, target); auto scope = BuildScope(target, graph); @@ -191,7 +191,7 @@ void RunCublas( auto C = net_builder.Matmul(A, B, trans_a, trans_b); auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "TransToCustomCallPass"); @@ -245,7 +245,7 @@ TEST(GraphCompilerTest, TestLowering) { auto c = builder.Add(a, b, 1); auto d = builder.Relu(c); - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto program = builder.Build(); auto graph = Optimize(&program, {}, target); auto scope = BuildScope(target, graph); @@ -265,7 +265,7 @@ TEST(GraphCompilerTest, TestCodegenAndJit) { auto c = builder.Add(a, b, 1); auto d = builder.Relu(c); - 
auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto program = builder.Build(); auto graph = Optimize(&program, {}, target); auto scope = BuildScope(target, graph); @@ -285,7 +285,7 @@ TEST(GraphCompilerTest, TestBuildInstruction) { auto c = builder.Add(a, b, 1); auto d = builder.Relu(c); - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto program = builder.Build(); auto graph = Optimize(&program, {}, target); auto scope = BuildScope(target, graph); diff --git a/paddle/cinn/hlir/framework/graph_test.cc b/paddle/cinn/hlir/framework/graph_test.cc index 42aec09eca3c2d..b9741931d25235 100644 --- a/paddle/cinn/hlir/framework/graph_test.cc +++ b/paddle/cinn/hlir/framework/graph_test.cc @@ -35,7 +35,7 @@ TEST(Graph, visualize) { auto reduce_sum_1 = builder.ReduceSum(relu_1, {1}); auto program = builder.Build(); - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); auto graph = std::make_shared(program, target); ApplyPass(graph.get(), "OpFusion"); @@ -56,7 +56,7 @@ TEST(Graph, visualize_recompute) { auto add_3 = builder.Add(y, broadcast_to_2); auto program = builder.Build(); - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); auto graph = std::make_shared(program, target); ApplyPass(graph.get(), "OpFusionPass"); ApplyPass(graph.get(), "FusionMergePass"); diff --git a/paddle/cinn/hlir/framework/instruction.cc b/paddle/cinn/hlir/framework/instruction.cc index d9b624e518e22d..7a85318654efc5 100644 --- a/paddle/cinn/hlir/framework/instruction.cc +++ b/paddle/cinn/hlir/framework/instruction.cc @@ -83,7 +83,7 @@ void Instruction::UpdateArgsCache( args_cached_.resize(cache_size); for (int i = 0; i < cache_size; ++i) { - common::ArgsBuilder builder; + cinn::common::ArgsBuilder builder; std::vector all_args = in_args_[i]; all_args.insert( std::end(all_args), out_args_[i].begin(), out_args_[i].end()); @@ -175,7 +175,7 @@ void Instruction::Run( CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first by " "calling SetLoweredFunc method"; if (!dryrun) { - if (target_ == common::DefaultNVGPUTarget()) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { ((lower_func_ptr_g)fn_ptrs_[idx])( static_cast(pod_args.data()), pod_args.size(), stream); } else { @@ -211,7 +211,7 @@ void Instruction::Run( pod_args[1], pod_args[2], static_cast(stream), - common::Layout::kNHWC); + cinn::common::Layout::kNHWC); } else { absl::flat_hash_map attrs_map = { @@ -231,7 +231,7 @@ void Instruction::Run( pod_args[1], pod_args[2], static_cast(stream), - common::Layout::kNCHW); + cinn::common::Layout::kNCHW); } } else if (str_attrs[0] == "backward_data") { // w, dy, dx @@ -322,7 +322,7 @@ void Instruction::Run( CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first by " "calling SetLoweredFunc method"; if (!dryrun) { - if (target_ == common::DefaultNVGPUTarget()) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { ((lower_func_ptr_g)fn_ptrs_[idx])( static_cast(pod_args.data()), pod_args.size(), stream); } else { @@ -341,7 +341,7 @@ void Instruction::Run( CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first by " "calling SetLoweredFunc method"; if (!dryrun) { - if (target_ == common::DefaultNVGPUTarget()) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { ((lower_func_ptr_g)fn_ptrs_[idx])( static_cast(pod_args.data()), pod_args.size(), stream); } else { diff --git 
a/paddle/cinn/hlir/framework/instruction.h b/paddle/cinn/hlir/framework/instruction.h index c6ec1aae014f5f..4e1a92e4b1c46a 100644 --- a/paddle/cinn/hlir/framework/instruction.h +++ b/paddle/cinn/hlir/framework/instruction.h @@ -109,7 +109,7 @@ class Instruction { auto& pod_args = args_cached_[idx]; CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first " "by calling SetLoweredFunc method"; - if (target_ == common::DefaultNVGPUTarget()) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { ((lower_func_ptr_g)fn_ptrs_[idx])( static_cast(pod_args.data()), pod_args.size(), stream); } else { diff --git a/paddle/cinn/hlir/framework/instruction_test.cc b/paddle/cinn/hlir/framework/instruction_test.cc index 85c99282ee747d..2e2b412cf4fdff 100644 --- a/paddle/cinn/hlir/framework/instruction_test.cc +++ b/paddle/cinn/hlir/framework/instruction_test.cc @@ -46,7 +46,7 @@ std::unique_ptr GetLoweredFunc(int M, int N) { auto stages = CreateStages({z}); auto fn = Lower("fn", stages, {x, y, z}); - ir::Module::Builder builder("some_module", common::DefaultHostTarget()); + ir::Module::Builder builder("some_module", cinn::common::DefaultHostTarget()); builder.AddFunction(fn); auto jit = backends::SimpleJIT::Create(); @@ -59,7 +59,7 @@ void InstantiateScope(int M, int N, Scope* scope) { auto* var = scope->Var(name); auto& tensor = absl::get(*var); tensor->Resize(Shape{{M, N}}); - auto* data = tensor->mutable_data(common::DefaultHostTarget()); + auto* data = tensor->mutable_data(cinn::common::DefaultHostTarget()); for (int i = 0; i < M * N; i++) { data[i] = (rand() * 1.f) / RAND_MAX; // NOLINT } @@ -73,7 +73,8 @@ TEST(Instruction, basic) { Scope scope; InstantiateScope(M, N, &scope); // create Instruction - Instruction instr(common::DefaultHostTarget(), &scope, {"x", "y"}, {"z"}); + Instruction instr( + cinn::common::DefaultHostTarget(), &scope, {"x", "y"}, {"z"}); auto jit = GetLoweredFunc(M, N); auto fn_ptr = jit->Lookup("fn"); CHECK(fn_ptr); @@ -106,8 +107,8 @@ TEST(Instruction, RunWithRawPodArgs) { // case 1: create cinn_pod_value_t arguments dicrectly std::vector args_buffer( 3); // store {"x", "y", "z"} buffer objects - auto* default_memory_mng = - MemoryManager::Global().RetrieveSafely(common::DefaultHostTarget().arch); + auto* default_memory_mng = MemoryManager::Global().RetrieveSafely( + cinn::common::DefaultHostTarget().arch); int count = 0; for (const auto& name : std::vector({"x", "y", "z"})) { @@ -128,8 +129,10 @@ TEST(Instruction, RunWithRawPodArgs) { auto jit = GetLoweredFunc(M, N); auto fn_ptr = jit->Lookup("fn"); CHECK(fn_ptr); - Instruction instr( - common::DefaultHostTarget(), nullptr, {"x", "y"}, {"z"}); // empty scope + Instruction instr(cinn::common::DefaultHostTarget(), + nullptr, + {"x", "y"}, + {"z"}); // empty scope instr.SetLoweredFunc(reinterpret_cast(fn_ptr)); instr.Finalize(); @@ -312,12 +315,15 @@ TEST(Instruction, CONV_FORWARD) { Operator::GetAttrs("infershape")[conv2d]; CUDA_CALL(cudaSetDevice(0)); - auto buffer_x = - common::BufferBuilder(Float(32), {in, ic, ih, iw}).set_random().Build(); - auto buffer_w = - common::BufferBuilder(Float(32), {fn, fc, fh, fw}).set_random().Build(); - auto buffer_y = - common::BufferBuilder(Float(32), {on, oc, oh, ow}).set_random().Build(); + auto buffer_x = cinn::common::BufferBuilder(Float(32), {in, ic, ih, iw}) + .set_random() + .Build(); + auto buffer_w = cinn::common::BufferBuilder(Float(32), {fn, fc, fh, fw}) + .set_random() + .Build(); + auto buffer_y = cinn::common::BufferBuilder(Float(32), {on, oc, oh, ow}) + .set_random() 
+ .Build(); void *dev_x = nullptr, *dev_w = nullptr, *dev_y = nullptr; CUDA_CALL(cudaMalloc(&dev_x, buffer_x->memory_size)); @@ -353,7 +359,7 @@ TEST(Instruction, CONV_FORWARD) { std::vector pod_args = {x, w, y}; Scope scope; - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); std::vector in_args, out_args; TestInstruction instr(target, &scope, in_args, out_args, "conv2d"); @@ -403,12 +409,15 @@ TEST(Instruction, CONV_BACKWARD_DATA) { Operator::GetAttrs("infershape")[conv2d]; CUDA_CALL(cudaSetDevice(0)); - auto buffer_x = - common::BufferBuilder(Float(32), {in, ic, ih, iw}).set_random().Build(); - auto buffer_w = - common::BufferBuilder(Float(32), {fn, fc, fh, fw}).set_random().Build(); - auto buffer_y = - common::BufferBuilder(Float(32), {on, oc, oh, ow}).set_random().Build(); + auto buffer_x = cinn::common::BufferBuilder(Float(32), {in, ic, ih, iw}) + .set_random() + .Build(); + auto buffer_w = cinn::common::BufferBuilder(Float(32), {fn, fc, fh, fw}) + .set_random() + .Build(); + auto buffer_y = cinn::common::BufferBuilder(Float(32), {on, oc, oh, ow}) + .set_random() + .Build(); void *dev_x = nullptr, *dev_w = nullptr, *dev_y = nullptr; CUDA_CALL(cudaMalloc(&dev_x, buffer_x->memory_size)); @@ -445,7 +454,7 @@ TEST(Instruction, CONV_BACKWARD_DATA) { std::vector pod_args = {w, y, x}; Scope scope; - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); std::vector in_args, out_args; TestInstruction instr(target, &scope, in_args, out_args, "conv2d"); @@ -510,12 +519,15 @@ TEST(Instruction, CONV_BACKWARD_FILTER) { ASSERT_EQ(infer_shape[0][3], fw); CUDA_CALL(cudaSetDevice(0)); - auto buffer_x = - common::BufferBuilder(Float(32), {in, ic, ih, iw}).set_random().Build(); - auto buffer_w = - common::BufferBuilder(Float(32), {fn, fc, fh, fw}).set_random().Build(); - auto buffer_y = - common::BufferBuilder(Float(32), {on, oc, oh, ow}).set_random().Build(); + auto buffer_x = cinn::common::BufferBuilder(Float(32), {in, ic, ih, iw}) + .set_random() + .Build(); + auto buffer_w = cinn::common::BufferBuilder(Float(32), {fn, fc, fh, fw}) + .set_random() + .Build(); + auto buffer_y = cinn::common::BufferBuilder(Float(32), {on, oc, oh, ow}) + .set_random() + .Build(); void *dev_x = nullptr, *dev_w = nullptr, *dev_y = nullptr; CUDA_CALL(cudaMalloc(&dev_x, buffer_x->memory_size)); @@ -552,7 +564,7 @@ TEST(Instruction, CONV_BACKWARD_FILTER) { std::vector pod_args = {x, y, w}; Scope scope; - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); std::vector in_args, out_args; TestInstruction instr(target, &scope, in_args, out_args, "conv2d"); diff --git a/paddle/cinn/hlir/framework/memory.cc b/paddle/cinn/hlir/framework/memory.cc index 6c567bb84f6b72..bfc33b31beda9e 100755 --- a/paddle/cinn/hlir/framework/memory.cc +++ b/paddle/cinn/hlir/framework/memory.cc @@ -25,7 +25,7 @@ namespace cinn { namespace hlir { namespace framework { -using common::Target; +using cinn::common::Target; namespace { diff --git a/paddle/cinn/hlir/framework/memory.h b/paddle/cinn/hlir/framework/memory.h old mode 100755 new mode 100644 index ee84433ed29e45..3b8c59887d7fed --- a/paddle/cinn/hlir/framework/memory.h +++ b/paddle/cinn/hlir/framework/memory.h @@ -41,7 +41,7 @@ class MemoryInterface { */ class MemoryManager final { public: - using key_t = common::Target::Arch; + using key_t = cinn::common::Target::Arch; static MemoryManager& Global() { static auto* x = new MemoryManager; @@ -69,7 +69,8 @@ class 
MemoryManager final { private: MemoryManager(); - absl::flat_hash_map> + absl::flat_hash_map> memory_mngs_; CINN_DISALLOW_COPY_AND_ASSIGN(MemoryManager); diff --git a/paddle/cinn/hlir/framework/node.cc b/paddle/cinn/hlir/framework/node.cc index 20b2eb90921f02..4f50d930f4c7e2 100644 --- a/paddle/cinn/hlir/framework/node.cc +++ b/paddle/cinn/hlir/framework/node.cc @@ -22,22 +22,26 @@ namespace cinn { namespace hlir { namespace framework { -std::tuple Node::LinkTo( +std::tuple Node::LinkTo( NodeData* other) { - return this->common::GraphNode::LinkTo(other->as()); + return this->cinn::common::GraphNode::LinkTo( + other->as()); } -std::tuple NodeData::LinkTo( +std::tuple NodeData::LinkTo( Node* other) { - return this->common::GraphNode::LinkTo(other->as()); + return this->cinn::common::GraphNode::LinkTo( + other->as()); } void Node::Controls(NodeData* other) { - return this->common::GraphNode::Controls(other->as()); + return this->cinn::common::GraphNode::Controls( + other->as()); } void NodeData::Controls(Node* other) { - return this->common::GraphNode::Controls(other->as()); + return this->cinn::common::GraphNode::Controls( + other->as()); } namespace { @@ -82,15 +86,17 @@ std::ostream& operator<<(std::ostream& os, const NodeAttr& node_attr) { } //! Using index to sort the input/output tensors -bool edge_index_compare(const common::Shared& a, - const common::Shared& b) { +bool edge_index_compare( + const cinn::common::Shared& a, + const cinn::common::Shared& b) { CHECK_NOTNULL(a.get()); CHECK_NOTNULL(b.get()); return a->index() < b->index(); } -std::vector> Node::inlinks_in_order() const { - std::vector> ordered_links; +std::vector> +Node::inlinks_in_order() const { + std::vector> ordered_links; for (auto& in_edge : this->inlinks()) { ordered_links.push_back(in_edge); CHECK_GE(in_edge->index(), 0) @@ -101,8 +107,9 @@ std::vector> Node::inlinks_in_order() const { return ordered_links; } -std::vector> Node::outlinks_in_order() const { - std::vector> ordered_links; +std::vector> +Node::outlinks_in_order() const { + std::vector> ordered_links; for (auto& out_edge : this->outlinks()) { ordered_links.push_back(out_edge); CHECK_GE(out_edge->index(), 0) @@ -113,7 +120,7 @@ std::vector> Node::outlinks_in_order() const { return ordered_links; } -NodeData* InsertGraphOpNodeAfter(common::Graph* graph, +NodeData* InsertGraphOpNodeAfter(cinn::common::Graph* graph, Node* insert_node, NodeData* input_nodedata, Node* out_node, @@ -122,11 +129,11 @@ NodeData* InsertGraphOpNodeAfter(common::Graph* graph, CHECK(insert_node); CHECK(input_nodedata); input_nodedata->Controls(insert_node); - common::Shared node_ptr(insert_node); + cinn::common::Shared node_ptr(insert_node); auto* out_nodedata = new NodeData( - node_ptr, 0, 0, common::UniqName(insert_node->id() + "_out")); + node_ptr, 0, 0, cinn::common::UniqName(insert_node->id() + "_out")); insert_node->Controls(out_nodedata); - std::vector old_sources; + std::vector old_sources; auto input_links = out_node->inlinks_in_order(); if (out_node) { @@ -151,7 +158,7 @@ NodeData* InsertGraphOpNodeAfter(common::Graph* graph, return out_nodedata; } -NodeData* InsertGraphOpNodeBefore(common::Graph* graph, +NodeData* InsertGraphOpNodeBefore(cinn::common::Graph* graph, Node* insert_node, Node* input_node, NodeData* dst_data, @@ -161,9 +168,9 @@ NodeData* InsertGraphOpNodeBefore(common::Graph* graph, CHECK(input_node); CHECK(dst_data); auto node_ptr = dst_data->source_node; - auto* input_node_out = - new NodeData(node_ptr, 0, 0, common::UniqName(input_node->id() + "_out")); - 
std::vector old_sinks; + auto* input_node_out = new NodeData( + node_ptr, 0, 0, cinn::common::UniqName(input_node->id() + "_out")); + std::vector old_sinks; const auto& old_outlinks = input_node->outlinks_in_order(); for (auto& link : old_outlinks) { auto sink = link->sink(); @@ -173,7 +180,7 @@ NodeData* InsertGraphOpNodeBefore(common::Graph* graph, } input_node_out->Controls(insert_node); insert_node->Controls(dst_data); - dst_data->source_node = common::Shared(insert_node); + dst_data->source_node = cinn::common::Shared(insert_node); for (int i = 0; i < old_sinks.size(); i++) { if (i == pos) { diff --git a/paddle/cinn/hlir/framework/node.h b/paddle/cinn/hlir/framework/node.h index 31d316bbbff8d4..764492df45f383 100644 --- a/paddle/cinn/hlir/framework/node.h +++ b/paddle/cinn/hlir/framework/node.h @@ -32,7 +32,7 @@ namespace framework { class Node; class NodeData; -using NodePtr = common::Shared; +using NodePtr = cinn::common::Shared; using AttrType = utils::Attribute; using AttrMapType = utils::AttributeMap; @@ -65,7 +65,7 @@ std::ostream &operator<<(std::ostream &os, const NodeAttr &node_attr); /** * \brief Node represents an operation in a computation graph. */ -class Node : public common::GraphNode { +class Node : public cinn::common::GraphNode { public: Node() = default; Node(const Operator *op, const std::string &name, std::string id = {}) { @@ -74,7 +74,8 @@ class Node : public common::GraphNode { this->id_ = std::move(id); } const char *type_info() const override { return __type_info__; } - std::tuple LinkTo(NodeData *other); + std::tuple LinkTo( + NodeData *other); // This node determines another node, which means the other node depends on // this node. @@ -92,11 +93,13 @@ class Node : public common::GraphNode { //! Get the input tensors in order to match tensors correctly. If refreshed, //! we will update the links. - std::vector> inlinks_in_order() const; + std::vector> inlinks_in_order() + const; //! Get the output tensors in order to match tensors correctly. If refreshed, //! we will update the links. - std::vector> outlinks_in_order() const; + std::vector> outlinks_in_order() + const; inline const Operator *op() const { return this->attrs.op; } @@ -123,7 +126,7 @@ class Node : public common::GraphNode { template static NodePtr Create(Args &&...args) { - return common::Shared(new Node(std::forward(args)...)); + return cinn::common::Shared(new Node(std::forward(args)...)); } static constexpr char *__type_info__ = "hlir_framework_node"; @@ -138,7 +141,7 @@ class Node : public common::GraphNode { /** * \brief NodeData represents the output data from an operator. */ -class NodeData : public common::GraphNode { +class NodeData : public cinn::common::GraphNode { using attr_t = AttrType; public: @@ -155,7 +158,8 @@ class NodeData : public common::GraphNode { NodeData() : source_node(), output_index(), version(), id_(), is_const_() {} - std::tuple LinkTo(Node *other); + std::tuple LinkTo( + Node *other); // This node determines another node, which means the other node depends on // this node.
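
A note on the structure these node.cc/node.h hunks touch: Node is an operator and NodeData one of its output tensors, so the graph alternates op and tensor nodes, and inlinks_in_order()/outlinks_in_order() recover operand order by sorting the shared graph edges by their index. A minimal usage sketch, modeled on the InsertGraphOpNodeAfter logic above (op_node and consumer_node are hypothetical placeholders; the calls are the ones visible in this patch):

  cinn::common::Shared<Node> node_ptr(op_node);  // refcounted handle to the op
  auto* out = new NodeData(
      node_ptr, 0, 0, cinn::common::UniqName(op_node->id() + "_out"));
  op_node->Controls(out);         // op -> its output tensor
  out->Controls(consumer_node);   // output tensor -> consuming op
  for (auto& edge : op_node->outlinks_in_order()) {
    CHECK_GE(edge->index(), 0);   // edges come back sorted by operand index
  }
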
@@ -219,13 +223,13 @@ class NodeData : public common::GraphNode { }; // insert op_node after input_data -NodeData *InsertGraphOpNodeAfter(common::Graph *graph, +NodeData *InsertGraphOpNodeAfter(cinn::common::Graph *graph, Node *insert_node, NodeData *input_nodedata, Node *dst_node, int pos); // insert op_node before out_data -NodeData *InsertGraphOpNodeBefore(common::Graph *graph, +NodeData *InsertGraphOpNodeBefore(cinn::common::Graph *graph, Node *insert_node, Node *input_node, NodeData *dst_data, diff --git a/paddle/cinn/hlir/framework/op_lowering.h b/paddle/cinn/hlir/framework/op_lowering.h index 87d89360d4fff0..8e69ebe4a618fc 100644 --- a/paddle/cinn/hlir/framework/op_lowering.h +++ b/paddle/cinn/hlir/framework/op_lowering.h @@ -30,7 +30,7 @@ namespace cinn { namespace hlir { namespace framework { -using common::Target; +using cinn::common::Target; using GroupPtr = std::shared_ptr; template diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.cc b/paddle/cinn/hlir/framework/op_lowering_impl.cc index f955e7b96cf61a..0db3788b007cca 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/op_lowering_impl.cc @@ -32,8 +32,8 @@ namespace cinn { namespace hlir { namespace framework { -using common::bfloat16; -using common::float16; +using cinn::common::bfloat16; +using cinn::common::float16; using framework::Node; using framework::NodeData; @@ -41,7 +41,7 @@ using framework::OpPatternKind; using framework::shape_t; using framework::StrategyFunction; -using common::Type; +using cinn::common::Type; using cinn::hlir::op::ExternalApiRegistry; @@ -211,10 +211,11 @@ std::vector OpLowererImpl::LowerCustomCall( } else { external_api = ExternalApiRegistry::Global()->GetExternalApi(node, target_); } - std::vector compute_args = { - common::CINNValue(group->GetFuncName()), common::CINNValue(external_api)}; - common::CINNValuePack pack = - impl->fcompute(common::CINNValuePack{compute_args}); + std::vector compute_args = { + cinn::common::CINNValue(group->GetFuncName()), + cinn::common::CINNValue(external_api)}; + cinn::common::CINNValuePack pack = + impl->fcompute(cinn::common::CINNValuePack{compute_args}); if (pack.size() != 1) { std::ostringstream err_msg; err_msg << "Lowering custom call, group func name: " << group->GetFuncName() @@ -371,19 +372,19 @@ std::vector OpLowererImpl::DoOpLower( std::unordered_map* tensor_map, std::vector* op_func_arg_tensors) { VLOG(4) << "Do lower with Compute, op: " << node->op()->name; - std::vector cinn_inputs; + std::vector cinn_inputs; for (const ir::Tensor& tensor : *op_func_arg_tensors) { - cinn_inputs.push_back(common::CINNValue(ir::Expr(tensor))); + cinn_inputs.push_back(cinn::common::CINNValue(ir::Expr(tensor))); } // set tensor name = node data name std::vector node_datas = GetAllNodeData(node); for (const NodeData* node_data : node_datas) { - cinn_inputs.push_back(common::CINNValue(node_data->id())); + cinn_inputs.push_back(cinn::common::CINNValue(node_data->id())); } // 1.Do compute - common::CINNValuePack pack = - op_impl->fcompute(common::CINNValuePack{cinn_inputs}); + cinn::common::CINNValuePack pack = + op_impl->fcompute(cinn::common::CINNValuePack{cinn_inputs}); poly::StageMap tmp_stages = pack.back(); std::string post = ""; @@ -405,7 +406,7 @@ std::vector OpLowererImpl::DoOpLower( // Insert output tensors into function arg if (!expr.as_tensor_ref()->buffer.defined() || - this->target_ != common::DefaultNVGPUTarget()) { + this->target_ != cinn::common::DefaultNVGPUTarget()) { 
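// i.e. an output tensor that has no pre-bound buffer, or any output when // the lowering target is not NVGPU, is appended to the function's argument // list and then given a buffer of its own.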
op_func_arg_tensors->push_back(expr.as_tensor_ref()); expr.as_tensor_ref()->WithBuffer(); } @@ -448,18 +449,18 @@ ir::Expr OpLowererImpl::DoOpSchedule( const std::vector& op_func_arg_tensors, const std::vector& lowered_funcs) { VLOG(4) << "Do op schedule"; - std::vector schedule_inputs; + std::vector schedule_inputs; // 1.Collect tensors for (const ir::Tensor& op_func_arg_tensor : op_func_arg_tensors) { - schedule_inputs.push_back(common::CINNValue(op_func_arg_tensor)); + schedule_inputs.push_back(cinn::common::CINNValue(op_func_arg_tensor)); } // 2.Collect bodies to be scheduled for (const ir::LoweredFunc& func : lowered_funcs) { - schedule_inputs.push_back(common::CINNValue(func->body)); + schedule_inputs.push_back(cinn::common::CINNValue(func->body)); } // 3.Do schedule on AST - common::CINNValuePack expr_pack = - op_impl->fschedule(common::CINNValuePack{schedule_inputs}); + cinn::common::CINNValuePack expr_pack = + op_impl->fschedule(cinn::common::CINNValuePack{schedule_inputs}); VLOG(4) << "After op schedule: " << expr_pack[0].operator ir::Expr(); return expr_pack[0].operator ir::Expr(); diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.h b/paddle/cinn/hlir/framework/op_lowering_impl.h index 5a562f4d1cabd0..895b73c87bdf1b 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl.h @@ -37,7 +37,7 @@ namespace hlir { namespace framework { using GroupPtr = std::shared_ptr; -using common::Target; +using cinn::common::Target; class OpLowererImpl; typedef bool (OpLowererImpl::*ScheduleDetermineFunction)(Node*); diff --git a/paddle/cinn/hlir/framework/op_lowering_test.cc b/paddle/cinn/hlir/framework/op_lowering_test.cc index 602003719e5746..07fcc7a48e0164 100644 --- a/paddle/cinn/hlir/framework/op_lowering_test.cc +++ b/paddle/cinn/hlir/framework/op_lowering_test.cc @@ -34,7 +34,7 @@ using frontend::RunDecomposer; void CodeGen(const ir::LoweredFunc& func) { #ifdef CINN_WITH_CUDA - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); Module::Builder builder("module_builder", target); builder.AddFunction(func); @@ -44,7 +44,7 @@ void CodeGen(const ir::LoweredFunc& func) { std::string code = ""; compiler->Build(module, code); #else - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); ir::Module::Builder builder("Module_Builder", target); builder.AddFunction(func); @@ -58,7 +58,7 @@ void CodeGen(const ir::LoweredFunc& func) { void Compile(NetBuilder& net_builder) { // NOLINT auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -1204,9 +1204,9 @@ TEST(OP_LOWERING, Reduce_Fusion_Test_21) { */ TEST(OpFusionPass, Block_Reduce_Fuse_Broadcast) { - int sm_count = common::DefaultNVGPUTarget().get_multi_processor_count(); + int sm_count = cinn::common::DefaultNVGPUTarget().get_multi_processor_count(); int max_threads_per_sm = - common::DefaultNVGPUTarget().get_max_threads_per_sm(); + cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm(); int warp_reduce_threshold = sm_count * max_threads_per_sm / 32; int h = warp_reduce_threshold - 10; int w = 256; @@ -1222,9 +1222,9 @@ TEST(OpFusionPass, Block_Reduce_Fuse_Broadcast) { } TEST(OpFusionPass, Block_Reduce_Fuse_Elementwise) { - int sm_count = common::DefaultNVGPUTarget().get_multi_processor_count(); + int sm_count = 
cinn::common::DefaultNVGPUTarget().get_multi_processor_count(); int max_threads_per_sm = - common::DefaultNVGPUTarget().get_max_threads_per_sm(); + cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm(); int warp_reduce_threshold = sm_count * max_threads_per_sm / 32; int h = warp_reduce_threshold - 10; int w = 256; @@ -1240,9 +1240,9 @@ TEST(OpFusionPass, Block_Reduce_Fuse_Elementwise) { Compile(net_builder); } TEST(OpFusionPass, Warp_Reduce_Fuse_Broadcast) { - int sm_count = common::DefaultNVGPUTarget().get_multi_processor_count(); + int sm_count = cinn::common::DefaultNVGPUTarget().get_multi_processor_count(); int max_threads_per_sm = - common::DefaultNVGPUTarget().get_max_threads_per_sm(); + cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm(); int warp_reduce_threshold = sm_count * max_threads_per_sm / 32; int h = warp_reduce_threshold + 10; int w = 256; @@ -1258,9 +1258,9 @@ TEST(OpFusionPass, Warp_Reduce_Fuse_Broadcast) { } TEST(OpFusionPass, Warp_Reduce_Fuse_Elementwise) { - int sm_count = common::DefaultNVGPUTarget().get_multi_processor_count(); + int sm_count = cinn::common::DefaultNVGPUTarget().get_multi_processor_count(); int max_threads_per_sm = - common::DefaultNVGPUTarget().get_max_threads_per_sm(); + cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm(); int warp_reduce_threshold = sm_count * max_threads_per_sm / 32; int h = warp_reduce_threshold + 10; int w = 256; diff --git a/paddle/cinn/hlir/framework/op_lowering_util.cc b/paddle/cinn/hlir/framework/op_lowering_util.cc index 1af9ef05763517..5a332324c7c89b 100644 --- a/paddle/cinn/hlir/framework/op_lowering_util.cc +++ b/paddle/cinn/hlir/framework/op_lowering_util.cc @@ -53,11 +53,11 @@ ir::Tensor GetTensor( return lang::Placeholder(node_data->id(), shape_dict.at(node_data->id())); } else if (dtype.is_bfloat16()) { - return lang::Placeholder(node_data->id(), - shape_dict.at(node_data->id())); + return lang::Placeholder( + node_data->id(), shape_dict.at(node_data->id())); } else if (dtype.is_float16()) { - return lang::Placeholder(node_data->id(), - shape_dict.at(node_data->id())); + return lang::Placeholder( + node_data->id(), shape_dict.at(node_data->id())); } else if (dtype.is_bool()) { return lang::Placeholder(node_data->id(), shape_dict.at(node_data->id())); @@ -546,7 +546,7 @@ bool WithoutLastDimInReduce(const std::vector& shape, void LoopOrderAssignReduce(ir::IRSchedule& ir_sch, // NOLINT const std::string& block_name, const std::vector& axes, - const common::Target& target, + const cinn::common::Target& target, const bool just_reorder = false) { // reorder non-last reduce axis to last. // like: shape = [16,16,16,16,16],axes = [1,3] -> new order = [0, 2, 4, 1, 3]. @@ -597,7 +597,7 @@ void LoopAssignReduceWithoutLast(ir::IRSchedule& ir_sch, // NOLINT const std::string& block_name, const std::vector& inshape, const std::vector& axes, - const common::Target& target) { + const cinn::common::Target& target) { int tail = 0; bool bound = true; auto shape = pe::GetFirstStepReduceShape(inshape, axes, bound, tail); @@ -711,11 +711,11 @@ void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch, // NOLINT const std::string& block_name, const std::vector& inshape, const std::vector& axes, - const common::Target& target) { + const cinn::common::Target& target) { // If the number of current device SM is smaller than the number of SM // required by Warp Reduce, the performance of Warp Reduce is better. // Otherwise, use Block Reduce.
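// (Illustrative scale only, not values from this patch: the OpFusionPass // tests above size their inputs around sm_count * max_threads_per_sm / 32, // e.g. 80 SMs * 2048 threads / 32 = 5120 rows, to land just below or just // above this warp/block cutoff.)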
- auto max_num_threads = common::DefaultNVGPUTarget().max_num_threads(); + auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads(); int need_reduce_last_count = 1; for (int i = 0; i < inshape.size(); i++) { if (find(axes.begin(), axes.end(), i) == axes.end()) { diff --git a/paddle/cinn/hlir/framework/op_strategy.h b/paddle/cinn/hlir/framework/op_strategy.h index b782e943b2c217..b0ff691828860c 100644 --- a/paddle/cinn/hlir/framework/op_strategy.h +++ b/paddle/cinn/hlir/framework/op_strategy.h @@ -36,12 +36,12 @@ using StrategyFunction = std::function( const std::vector&, const std::vector&, const std::vector>&, - const common::Target&)>; + const cinn::common::Target&)>; using InferShapeFunction = std::function>( const std::vector>&, const AttrMapType&)>; //! Operator implementation that includes compute and schedule function. -class OpImpl : public common::Object { +class OpImpl : public cinn::common::Object { public: //! Compute function CINNCompute fcompute; @@ -72,7 +72,7 @@ class OpImpl : public common::Object { * @param target The build target. * @return The computation schedule. */ - common::Shared GetSchedule( + cinn::common::Shared GetSchedule( const std::vector& outs, const std::vector& temp_tensors, const Target& target) { @@ -88,7 +88,7 @@ class OpImpl : public common::Object { }; //! Specialized implementations for operators under certain conditions. -class OpSpec : public common::Object { +class OpSpec : public cinn::common::Object { public: //! List of implementations. std::vector> implementations; @@ -119,7 +119,7 @@ class OpSpec : public common::Object { }; //! Operator strategy class. -class OpStrategy : public common::Object { +class OpStrategy : public cinn::common::Object { public: const char* type_info() const override { return __type_info__; } //! List of operator specializations. 
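
Taken together, the renames in op_lowering_impl.cc and op_strategy.h all orbit one calling convention: an operator's fcompute receives a cinn::common::CINNValuePack holding the input tensor exprs followed by the output tensor names, and returns a pack whose last element is the stage map; fschedule receives the tensors plus the lowered function bodies and returns the scheduled AST. A minimal sketch of that round trip (input_tensor, out_tensor, out_name, func, and op_impl are hypothetical placeholders; the types and calls are exactly the ones renamed above, and the op_test.cc hunk just below builds the same kind of pack):

  using cinn::common::CINNValue;
  using cinn::common::CINNValuePack;

  // Compute: input exprs first, then output names; stages ride at the back.
  std::vector<CINNValue> compute_args{CINNValue(ir::Expr(input_tensor)),
                                      CINNValue(out_name)};
  CINNValuePack pack = op_impl->fcompute(CINNValuePack{compute_args});
  poly::StageMap stages = pack.back();

  // Schedule: tensors first, then the AST bodies to be scheduled.
  std::vector<CINNValue> schedule_args{CINNValue(out_tensor),
                                       CINNValue(func->body)};
  CINNValuePack expr_pack = op_impl->fschedule(CINNValuePack{schedule_args});
  ir::Expr scheduled = expr_pack[0].operator ir::Expr();
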
diff --git a/paddle/cinn/hlir/framework/op_test.cc b/paddle/cinn/hlir/framework/op_test.cc index 6648cbac3e17fe..bba9baf2c60c66 100644 --- a/paddle/cinn/hlir/framework/op_test.cc +++ b/paddle/cinn/hlir/framework/op_test.cc @@ -46,7 +46,7 @@ TEST(Operator, GetAttrs) { NodeAttr attrs; std::vector inputs{A, B}; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); auto impl = OpStrategy::SelectImpl( strategy[add](attrs, inputs, type, {{100, 32}}, target)); @@ -56,10 +56,10 @@ TEST(Operator, GetAttrs) { std::string func_name = "add1"; std::string out_name = "C"; - common::CINNValuePack cinn_input = - common::CINNValuePack{{common::CINNValue(A), - common::CINNValue(B), - common::CINNValue(out_name)}}; + cinn::common::CINNValuePack cinn_input = + cinn::common::CINNValuePack{{cinn::common::CINNValue(A), + cinn::common::CINNValue(B), + cinn::common::CINNValue(out_name)}}; std::vector input_output_names{"A", "B", out_name}; auto funcs = framework::GetFuncFromImpl( diff --git a/paddle/cinn/hlir/framework/parallel_compiler.cc b/paddle/cinn/hlir/framework/parallel_compiler.cc index 3a15f7c42bef0d..57055a9eb20225 100644 --- a/paddle/cinn/hlir/framework/parallel_compiler.cc +++ b/paddle/cinn/hlir/framework/parallel_compiler.cc @@ -229,13 +229,14 @@ void ParallelCompiler::Task::CodegenAndJit() { VLOG(2) << "Start Codegen and JIT on Group " << group_id << " at thread: " << std::this_thread::get_id(); // build module - ir::Module::Builder builder(common::UniqName("module"), context->target); + ir::Module::Builder builder(cinn::common::UniqName("module"), + context->target); for (auto& func : pcompiler->result_.LoweredFuncs(group_id)) { builder.AddFunction(func); } auto ir_module = builder.Build(); - if (context->target == common::DefaultNVGPUTarget()) { + if (context->target == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA auto splited_module = backends::SplitCudaAndHostModule(ir_module); auto hmodule = std::get<0>(splited_module); diff --git a/paddle/cinn/hlir/framework/parallel_compiler_test.cc b/paddle/cinn/hlir/framework/parallel_compiler_test.cc index 0379da139ed947..79d502e744f6bb 100644 --- a/paddle/cinn/hlir/framework/parallel_compiler_test.cc +++ b/paddle/cinn/hlir/framework/parallel_compiler_test.cc @@ -31,7 +31,7 @@ TEST(ParallelCompilerTest, Add_TEST_0) { auto A = builder.CreateInput(Float(32), {128, 128}, "A"); auto B = builder.CreateInput(Float(32), {128, 128}, "B"); auto C = builder.Add(A, B); - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto program = builder.Build(); auto graph = std::make_shared(program, target); auto scope = BuildScope(target, graph); @@ -49,7 +49,7 @@ TEST(ParallelCompilerTest, Conv2d_Test_0) { auto D = builder.Conv2d(A, B, {2, 2}, {1, 1}); auto E = builder.Add(C, D); - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto program = builder.Build(); auto graph = frontend::Optimize(&program, {}, target); auto scope = BuildScope(target, graph); @@ -67,7 +67,7 @@ TEST(ParallelCompilerTest, Matmul_Test_0) { auto D = builder.Matmul(A, B); auto E = builder.Add(C, D); - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto program = builder.Build(); auto graph = frontend::Optimize(&program, {}, target); auto scope = BuildScope(target, graph); diff --git a/paddle/cinn/hlir/framework/pir/compilation_task.cc 
b/paddle/cinn/hlir/framework/pir/compilation_task.cc index cacd2061e07520..cc792f3fce6495 100644 --- a/paddle/cinn/hlir/framework/pir/compilation_task.cc +++ b/paddle/cinn/hlir/framework/pir/compilation_task.cc @@ -61,7 +61,8 @@ void CompilationTask::Lowering() { } void CompilationTask::CodegenAndJit() { - ir::Module::Builder builder(common::UniqName("module"), context_->target_); + ir::Module::Builder builder(cinn::common::UniqName("module"), + context_->target_); CHECK_EQ(context_->predicates_.size(), context_->lowered_funcs_.size()); for (const ir::Expr predicate : context_->predicates_) { builder.AddPredicate(predicate); diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 9c48ec2326bffe..b1b4e5c23e3755 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -29,8 +29,8 @@ #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/lang/placeholder.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" -#include "paddle/phi/core/ddim.h" PD_DECLARE_bool(cinn_use_cuda_vectorize); PD_DECLARE_bool(cinn_enable_map_expr); @@ -42,8 +42,8 @@ namespace hlir { namespace framework { namespace pir { +using cinn::common::Type; using cinn::hlir::op::ExternalApiRegistry; -using common::Type; using framework::OpPatternKind; using framework::StrategyFunction; @@ -60,14 +60,14 @@ bool IsInTensorMap( return false; } -common::Type GetTensorDtype(const ::pir::Value& value) { +cinn::common::Type GetTensorDtype(const ::pir::Value& value) { auto type_info = value.type().dyn_cast(); - auto in_shape = phi::vectorize(type_info.dims()); + auto in_shape = ::common::vectorize(type_info.dims()); auto dtype = type_info.dtype(); return CompatibleInfo::ConvertIRType(dtype); } -common::Type GetTensorDtype( +cinn::common::Type GetTensorDtype( const std::string& name, const std::unordered_map<::pir::Value, ir::Tensor>& tensor_map) { for (auto iter : tensor_map) { @@ -76,12 +76,12 @@ common::Type GetTensorDtype( } } VLOG(4) << name << " is not in tensor map, return FP32 by default."; - return common::F32(); + return cinn::common::F32(); } ir::Tensor GetTensor(const GroupPtr& group, const ::pir::Value& value) { auto type_info = value.type().dyn_cast(); - auto in_shape = phi::vectorize(type_info.dims()); + auto in_shape = ::common::vectorize(type_info.dims()); auto dtype = type_info.dtype(); std::string input_id = CompatibleInfo::ValueName(value); if (group->shape_analysis != nullptr) { @@ -145,7 +145,7 @@ void CollectOutputInfo(::pir::Operation* op, out_value.type().dyn_cast(); out_types->push_back(CompatibleInfo::ConvertIRType(type_info.dtype())); - auto out_shape = phi::vectorize(type_info.dims()); + auto out_shape = ::common::vectorize(type_info.dims()); out_shapes->push_back(std::move(out_shape)); } } @@ -457,10 +457,11 @@ std::vector OpLowererImpl::LowerCustomCall( // external_api = ExternalApiRegistry::Global()->GetExternalApi(node, // target_); // } - std::vector compute_args = { - common::CINNValue(group->FuncName()), common::CINNValue(external_api)}; - common::CINNValuePack pack = - impl->fcompute(common::CINNValuePack{compute_args}); + std::vector compute_args = { + cinn::common::CINNValue(group->FuncName()), + cinn::common::CINNValue(external_api)}; + cinn::common::CINNValuePack pack = + impl->fcompute(cinn::common::CINNValuePack{compute_args}); CHECK_EQ(pack.size(), 1UL); // 
reset input names as extern api input args can't be deduplicated. // group->input_names.clear(); @@ -553,7 +554,7 @@ std::vector OpLowererImpl::PostProcess( } int_args_set.insert(symbol_name); group_func_args.emplace_back( - ir::_Var_::Make(symbol_name, common::Int(32))); + ir::_Var_::Make(symbol_name, cinn::common::Int(32))); group->int_args_map[non_tensor_arg_idx++] = {tensor_arg_idx, tensor_arg_dim_idx}; } @@ -632,21 +633,21 @@ std::vector OpLowererImpl::DoOpLower( std::unordered_map* tmp_tensor_info, std::vector* op_func_arg_tensors) { VLOG(4) << "Do lower with Compute, op: " << op->name(); - std::vector cinn_inputs; + std::vector cinn_inputs; for (const ir::Tensor& tensor : *op_func_arg_tensors) { - cinn_inputs.push_back(common::CINNValue(ir::Expr(tensor))); + cinn_inputs.push_back(cinn::common::CINNValue(ir::Expr(tensor))); } // set tensor name = operand hash name auto op_results = op->results(); for (const auto& result : op_results) { std::string output_id = CompatibleInfo::ValueName(result); - cinn_inputs.push_back(common::CINNValue(output_id)); + cinn_inputs.push_back(cinn::common::CINNValue(output_id)); } // 1.Do compute - common::CINNValuePack pack = - op_impl->fcompute(common::CINNValuePack{cinn_inputs}); + cinn::common::CINNValuePack pack = + op_impl->fcompute(cinn::common::CINNValuePack{cinn_inputs}); poly::StageMap tmp_stages = pack.back(); std::string post = ""; @@ -673,7 +674,7 @@ std::vector OpLowererImpl::DoOpLower( // Insert output tensors into function arg if (!expr.as_tensor_ref()->buffer.defined() || - this->target_ != common::DefaultNVGPUTarget()) { + this->target_ != cinn::common::DefaultNVGPUTarget()) { op_func_arg_tensors->push_back(expr.as_tensor_ref()); expr.as_tensor_ref()->WithBuffer(); } @@ -710,18 +711,18 @@ ir::Expr OpLowererImpl::DoOpSchedule( const std::vector& op_func_arg_tensors, const std::vector& lowered_funcs) { VLOG(4) << "Do op schedule"; - std::vector schedule_inputs; + std::vector schedule_inputs; // 1.Collect tensors for (const ir::Tensor& op_func_arg_tensor : op_func_arg_tensors) { - schedule_inputs.push_back(common::CINNValue(op_func_arg_tensor)); + schedule_inputs.push_back(cinn::common::CINNValue(op_func_arg_tensor)); } // 2.Collect bodies to be scheduled for (const ir::LoweredFunc& func : lowered_funcs) { - schedule_inputs.push_back(common::CINNValue(func->body)); + schedule_inputs.push_back(cinn::common::CINNValue(func->body)); } // 3.Do schedule on AST - common::CINNValuePack expr_pack = - op_impl->fschedule(common::CINNValuePack{schedule_inputs}); + cinn::common::CINNValuePack expr_pack = + op_impl->fschedule(cinn::common::CINNValuePack{schedule_inputs}); VLOG(4) << "After op schedule: " << expr_pack[0].operator ir::Expr(); return expr_pack[0].operator ir::Expr(); diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h index c8c2bdaa134435..ba768cc498f864 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h @@ -41,7 +41,7 @@ namespace pir { using GroupPtr = std::shared_ptr; -using common::Target; +using cinn::common::Target; class OpLowererImpl; typedef bool (OpLowererImpl::*ScheduleDetermineFunction)(::pir::Operation*); diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_util.cc b/paddle/cinn/hlir/framework/pir/op_lowering_util.cc index a9b14a215107a6..25c75ba4354ac7 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_util.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_util.cc @@ -519,7 +519,7 @@
::pir::Operation* GetMasterToComputeAt( void LoopOrderAssignReduce(ir::IRSchedule& ir_sch, // NOLINT const std::string& block_name, const std::vector& axes, - const common::Target& target, + const cinn::common::Target& target, const bool just_reorder = false) { // reorder non-last reduce axis to last. // like: shape = [16,16,16,16,16],axes = [1,3] -> new order = [0, 2, 4, 1, 3]. @@ -570,11 +570,11 @@ void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch, // NOLINT const std::string& block_name, const std::vector& inshape, const std::vector& axes, - const common::Target& target) { + const cinn::common::Target& target) { // If the number of current device SM is smaller than the number of SM // required by Warp Reduce, the performance of Warp Reduce is better. // Otherwise, use Block Reduce. - auto max_num_threads = common::DefaultNVGPUTarget().max_num_threads(); + auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads(); int need_reduce_last_count = 1; for (int i = 0; i < inshape.size(); i++) { if (find(axes.begin(), axes.end(), i) == axes.end()) { @@ -699,7 +699,7 @@ void LoopAssignReduceWithoutLast(ir::IRSchedule& ir_sch, // NOLINT const std::string& block_name, const std::vector& inshape, const std::vector& axes, - const common::Target& target) { + const cinn::common::Target& target) { int tail = 0; bool bound = true; auto shape = pe::GetFirstStepReduceShape(inshape, axes, bound, tail); diff --git a/paddle/cinn/hlir/framework/pir/utils.cc b/paddle/cinn/hlir/framework/pir/utils.cc index 5070aae0670a67..83aa5ff5844265 100644 --- a/paddle/cinn/hlir/framework/pir/utils.cc +++ b/paddle/cinn/hlir/framework/pir/utils.cc @@ -229,10 +229,10 @@ utils::AttributeMap CompatibleInfo::ConvertAttributes( } #define CASE_TYPE(src, dst) \ - else if (type.isa<::pir::src>()) return common::dst(); + else if (type.isa<::pir::src>()) return cinn::common::dst(); -common::Type CompatibleInfo::ConvertIRType(::pir::Type type) { - if (type.isa<::pir::BFloat16Type>()) return common::BF16(); +cinn::common::Type CompatibleInfo::ConvertIRType(::pir::Type type) { + if (type.isa<::pir::BFloat16Type>()) return cinn::common::BF16(); CASE_TYPE(Float16Type, F16) CASE_TYPE(Float32Type, F32) CASE_TYPE(Float64Type, F64) @@ -270,7 +270,7 @@ OpPatternKind CompatibleInfo::OpKind(const ::pir::Operation& op) { std::vector CompatibleInfo::ValueShape(const ::pir::Value& value) { auto& dim = value.type().dyn_cast<::pir::DenseTensorType>().dims(); - return phi::vectorize(dim); + return ::common::vectorize(dim); } std::vector GetBroadcastAxis(const phi::DDim& in_shape, diff --git a/paddle/cinn/hlir/framework/pir/utils.h b/paddle/cinn/hlir/framework/pir/utils.h index 127a50eb925045..cd0f66af3f1ff4 100644 --- a/paddle/cinn/hlir/framework/pir/utils.h +++ b/paddle/cinn/hlir/framework/pir/utils.h @@ -20,7 +20,7 @@ #include "paddle/cinn/common/type.h" #include "paddle/cinn/hlir/framework/op.h" #include "paddle/cinn/utils/type_defs.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/pir/core/operation.h" namespace cinn { @@ -84,7 +84,7 @@ struct CompatibleInfo { static utils::AttributeMap ConvertAttributes(const ::pir::Operation& op); - static common::Type ConvertIRType(::pir::Type type); + static cinn::common::Type ConvertIRType(::pir::Type type); static std::vector ValueShape(const ::pir::Value& value); diff --git a/paddle/cinn/hlir/framework/print_graph_pass_test.cc b/paddle/cinn/hlir/framework/print_graph_pass_test.cc index cc3d51c4f79c09..b26c60c716d0c1 100644 ---
a/paddle/cinn/hlir/framework/print_graph_pass_test.cc +++ b/paddle/cinn/hlir/framework/print_graph_pass_test.cc @@ -63,7 +63,7 @@ TEST(Operator, GetAttrs) { auto d = prog.add(c, b); auto e = prog.add(c, d); ASSERT_EQ(prog.size(), 3); - Graph* g = new Graph(prog, common::DefaultHostTarget()); + Graph* g = new Graph(prog, cinn::common::DefaultHostTarget()); ApplyPass(g, "PrintGraph"); auto s = g->GetAttrs("print_graph"); LOG(INFO) << s; diff --git a/paddle/cinn/hlir/framework/schedule.h b/paddle/cinn/hlir/framework/schedule.h index 3fe12f5afae7c3..737328602d7561 100644 --- a/paddle/cinn/hlir/framework/schedule.h +++ b/paddle/cinn/hlir/framework/schedule.h @@ -30,7 +30,7 @@ namespace framework { * For operations and all the operations they depend on. * The schedule per Operation is named as stage. */ -class Schedule : public common::Object { +class Schedule : public cinn::common::Object { public: const char* type_info() const override { return __type_info__; } diff --git a/paddle/cinn/hlir/framework/scope_test.cc b/paddle/cinn/hlir/framework/scope_test.cc index 23ac65469af9a1..c9b2cd47832c86 100644 --- a/paddle/cinn/hlir/framework/scope_test.cc +++ b/paddle/cinn/hlir/framework/scope_test.cc @@ -25,7 +25,7 @@ TEST(Scope, basic) { auto* var = scope.Var("key"); auto& tensor = absl::get(*var); tensor->Resize(Shape{{3, 1}}); - auto* data = tensor->mutable_data(common::DefaultHostTarget()); + auto* data = tensor->mutable_data(cinn::common::DefaultHostTarget()); data[0] = 0.f; data[1] = 1.f; data[2] = 2.f; diff --git a/paddle/cinn/hlir/framework/tensor.h b/paddle/cinn/hlir/framework/tensor.h index 7b5d201d0f0ae0..59f115b32e2474 100644 --- a/paddle/cinn/hlir/framework/tensor.h +++ b/paddle/cinn/hlir/framework/tensor.h @@ -29,7 +29,7 @@ namespace cinn { namespace hlir { namespace framework { -using common::Target; +using cinn::common::Target; struct Shape { using dim_t = int; @@ -68,7 +68,7 @@ class _Tensor_ : public Object { inline void* mutable_data(const Target& target, const Type& type) { set_type(type); - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { buffer_->ResizeLazy(1024, shape_.numel() * type.bytes(), target); } else { buffer_->ResizeLazy(shape_.numel() * type.bytes(), target); @@ -79,7 +79,7 @@ class _Tensor_ : public Object { template inline T* mutable_data(const Target& target) { set_type(type_of()); - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { buffer_->ResizeLazy(1024, shape_.numel() * sizeof(T), target); } else { buffer_->ResizeLazy(shape_.numel() * sizeof(T), target); @@ -104,7 +104,7 @@ class _Tensor_ : public Object { const char* type_info() const override { return __type_info__; } private: - common::Type type_; + cinn::common::Type type_; // A shared ptr to make it easier to share buffer between tensors. 
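// (The mutable_data overloads above size this buffer lazily, via // ResizeLazy, to shape_.numel() times the element width.)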
std::shared_ptr buffer_; Shape shape_; diff --git a/paddle/cinn/hlir/framework/tensor_test.cc b/paddle/cinn/hlir/framework/tensor_test.cc index 30ce7c158696df..0644bf941c8a84 100644 --- a/paddle/cinn/hlir/framework/tensor_test.cc +++ b/paddle/cinn/hlir/framework/tensor_test.cc @@ -24,7 +24,7 @@ TEST(Tensor, basic) { _Tensor_ tensor; tensor.Resize(Shape{{3, 2}}); - auto* data = tensor.mutable_data(common::DefaultHostTarget()); + auto* data = tensor.mutable_data(cinn::common::DefaultHostTarget()); for (int i = 0; i < tensor.shape().numel(); i++) { data[i] = i; diff --git a/paddle/cinn/hlir/framework/visualize_helper.cc b/paddle/cinn/hlir/framework/visualize_helper.cc index bc8c5e41241868..b6e73f6f2c6978 100644 --- a/paddle/cinn/hlir/framework/visualize_helper.cc +++ b/paddle/cinn/hlir/framework/visualize_helper.cc @@ -177,7 +177,7 @@ bool MakeDirectory(const std::string& dirname, mode_t mode) { std::string GenNodeDataLabel( const NodeData* node, const absl::flat_hash_map& shape_dict, - const absl::flat_hash_map& dtype_dict, + const absl::flat_hash_map& dtype_dict, const std::string dot_nodedata_id) { std::stringstream ss; ss << dot_nodedata_id; @@ -194,7 +194,7 @@ std::string GenNodeDataLabel( } if (dtype_dict.count(node->id())) { ss << "\\n"; - ss << common::Type2Str(dtype_dict.at(node->id())); + ss << cinn::common::Type2Str(dtype_dict.at(node->id())); } return ss.str(); @@ -344,7 +344,7 @@ void AddGroupNode( const std::string& dot_cluster_id, const std::unordered_set& fetch_var_ids, const absl::flat_hash_map& shape_dict, - const absl::flat_hash_map& dtype_dict, + const absl::flat_hash_map& dtype_dict, std::unordered_map* recompute_nodes, std::unordered_map* outnode2dot_id, std::unordered_set* nodedatas_set, diff --git a/paddle/cinn/hlir/framework/visualize_helper.h b/paddle/cinn/hlir/framework/visualize_helper.h index 3afd3a974db0c8..2f281d3e3d4768 100644 --- a/paddle/cinn/hlir/framework/visualize_helper.h +++ b/paddle/cinn/hlir/framework/visualize_helper.h @@ -136,7 +136,7 @@ bool MakeDirectory(const std::string& dirname, mode_t mode); std::string GenNodeDataLabel( const NodeData* node, const absl::flat_hash_map& shape_dict, - const absl::flat_hash_map& dtype_dict, + const absl::flat_hash_map& dtype_dict, const std::string dot_nodedata_id); void Summary(const std::vector>& groups, @@ -152,7 +152,7 @@ void AddGroupNode( const std::string& dot_cluster_id, const std::unordered_set& fetch_var_ids, const absl::flat_hash_map& shape_dict, - const absl::flat_hash_map& dtype_dict, + const absl::flat_hash_map& dtype_dict, std::unordered_map* recompute_nodes, std::unordered_map* outnode2dot_id, std::unordered_set* nodedatas_set, diff --git a/paddle/cinn/hlir/op/broadcast.cc b/paddle/cinn/hlir/op/broadcast.cc index 2ce71d5198cf97..c2fc4586d94507 100644 --- a/paddle/cinn/hlir/op/broadcast.cc +++ b/paddle/cinn/hlir/op/broadcast.cc @@ -30,9 +30,9 @@ namespace cinn { namespace hlir { namespace op { -using common::_CINNValuePack_; -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::_CINNValuePack_; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::OpStrategy; using framework::shape_t; using framework::StrategyFunction; diff --git a/paddle/cinn/hlir/op/contrib/argmax.cc b/paddle/cinn/hlir/op/contrib/argmax.cc index 041cfe7dc47a50..7de32179b52a0b 100644 --- a/paddle/cinn/hlir/op/contrib/argmax.cc +++ b/paddle/cinn/hlir/op/contrib/argmax.cc @@ -39,12 +39,12 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; +using 
cinn::common::CINNValue; using framework::shape_t; using ir::Tensor; std::vector Argmax(const Tensor &in_tensor, - const common::Target &target, + const cinn::common::Target &target, poly::StageMap stages, const int &axis, const bool &keep_dims, @@ -116,7 +116,7 @@ std::shared_ptr StrategyForArgmax( [=](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of argmax compute is empty! Please check."; - common::CINNValuePack pack_args = args[0]; + cinn::common::CINNValuePack pack_args = args[0]; std::string tensor_name = UniqName("Argmax_out"); CHECK_GE(pack_args.size(), 1U) << "There should be 1 input args for argmax compute"; @@ -135,14 +135,14 @@ std::shared_ptr StrategyForArgmax( CINNValue(out_tensor[1]), CINNValue(out_tensor[2]), CINNValue(stages)}; - *ret = common::CINNValuePack{cinn_values}; + *ret = cinn::common::CINNValuePack{cinn_values}; }); framework::CINNSchedule argmax_schedule([=](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of argmax_schedule is empty! Please check.\n"; - common::CINNValuePack arg_pack = args[0]; + cinn::common::CINNValuePack arg_pack = args[0]; std::vector vec_ast; for (int i = 0; i < arg_pack.size(); i++) { if (arg_pack[i].is_expr()) { @@ -187,9 +187,9 @@ std::shared_ptr StrategyForArgmax( if (prod_size > 1 && target.arch == Target::Arch::X86) { pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); } - std::vector res{ - common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; + std::vector res{ + cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = cinn::common::CINNValuePack{res}; }); auto strategy = std::make_shared(); diff --git a/paddle/cinn/hlir/op/contrib/argmax.h b/paddle/cinn/hlir/op/contrib/argmax.h index b52f9e80f4ce56..4b5b519614f18c 100644 --- a/paddle/cinn/hlir/op/contrib/argmax.h +++ b/paddle/cinn/hlir/op/contrib/argmax.h @@ -22,7 +22,7 @@ namespace cinn { namespace hlir { namespace op { std::vector Argmax(const ir::Tensor &in_tensor, - const common::Target &target, + const cinn::common::Target &target, poly::StageMap stages, const int &axis, const bool &keep_dims = false, diff --git a/paddle/cinn/hlir/op/contrib/argmax_test.cc b/paddle/cinn/hlir/op/contrib/argmax_test.cc index 786e19b163a9a0..7a1fc95384cbc1 100644 --- a/paddle/cinn/hlir/op/contrib/argmax_test.cc +++ b/paddle/cinn/hlir/op/contrib/argmax_test.cc @@ -34,9 +34,9 @@ namespace hlir { namespace op { TEST(GenerateCode_Cpu, Argmax_Keep) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); int axis = 1; ir::Expr n(4); diff --git a/paddle/cinn/hlir/op/contrib/argmin.cc b/paddle/cinn/hlir/op/contrib/argmin.cc index 3caaf45c46a5eb..8f9d2ec9f45fd3 100644 --- a/paddle/cinn/hlir/op/contrib/argmin.cc +++ b/paddle/cinn/hlir/op/contrib/argmin.cc @@ -39,12 +39,12 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; +using cinn::common::CINNValue; using framework::shape_t; using ir::Tensor; std::vector Argmin(const Tensor &in_tensor, - const common::Target &target, + const cinn::common::Target &target, poly::StageMap stages, const int &axis, const bool &keep_dims, @@ -115,7 +115,7 @@ std::shared_ptr StrategyForArgmin( [=](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of argmin compute is empty! 
Please check."; - common::CINNValuePack pack_args = args[0]; + cinn::common::CINNValuePack pack_args = args[0]; CHECK_GE(pack_args.size(), 1U) << "There should be 1 input args for argmax compute"; Expr in_expr = pack_args[0]; @@ -133,14 +133,14 @@ std::shared_ptr StrategyForArgmin( CINNValue(out_tensor[1]), CINNValue(out_tensor[2]), CINNValue(stages)}; - *ret = common::CINNValuePack{cinn_values}; + *ret = cinn::common::CINNValuePack{cinn_values}; }); framework::CINNSchedule argmin_schedule([=](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of arange_schedule is empty! Please check.\n"; - common::CINNValuePack arg_pack = args[0]; + cinn::common::CINNValuePack arg_pack = args[0]; std::vector vec_ast; for (int i = 0; i < arg_pack.size(); i++) { if (arg_pack[i].is_expr()) { @@ -185,9 +185,9 @@ std::shared_ptr StrategyForArgmin( if (prod_size > 1 && target.arch == Target::Arch::X86) { pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); } - std::vector res{ - common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; + std::vector res{ + cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = cinn::common::CINNValuePack{res}; }); auto strategy = std::make_shared(); diff --git a/paddle/cinn/hlir/op/contrib/argmin.h b/paddle/cinn/hlir/op/contrib/argmin.h index 17b0095b5c8a41..a0cc5261fb9262 100644 --- a/paddle/cinn/hlir/op/contrib/argmin.h +++ b/paddle/cinn/hlir/op/contrib/argmin.h @@ -22,7 +22,7 @@ namespace cinn { namespace hlir { namespace op { std::vector Argmin(const ir::Tensor& in_tensor, - const common::Target& target, + const cinn::common::Target& target, poly::StageMap stages, const int& axis, const bool& keep_dims = false, diff --git a/paddle/cinn/hlir/op/contrib/argmin_test.cc b/paddle/cinn/hlir/op/contrib/argmin_test.cc index a979870fe88a90..beec71e153bb09 100644 --- a/paddle/cinn/hlir/op/contrib/argmin_test.cc +++ b/paddle/cinn/hlir/op/contrib/argmin_test.cc @@ -33,9 +33,9 @@ namespace hlir { namespace op { TEST(GenerateCode_Cpu, Argmin_Keep) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); int axis = 1; ir::Expr n(4); diff --git a/paddle/cinn/hlir/op/contrib/assert_true.cc b/paddle/cinn/hlir/op/contrib/assert_true.cc index a91f740c54892c..c3f39144b8a59c 100644 --- a/paddle/cinn/hlir/op/contrib/assert_true.cc +++ b/paddle/cinn/hlir/op/contrib/assert_true.cc @@ -33,8 +33,8 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; std::shared_ptr StrategyForAssertTrue( const framework::NodeAttr &attrs, diff --git a/paddle/cinn/hlir/op/contrib/bitcast_convert.cc b/paddle/cinn/hlir/op/contrib/bitcast_convert.cc index cfa957ba5f06fd..dc8516b160bd24 100644 --- a/paddle/cinn/hlir/op/contrib/bitcast_convert.cc +++ b/paddle/cinn/hlir/op/contrib/bitcast_convert.cc @@ -39,8 +39,8 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::shape_t; ir::Tensor BitcastConvert(const ir::Tensor &input, @@ -98,8 +98,8 @@ std::vector InferShapeForBitcastConvert( auto input_data_type_name = absl::get(attrs.at("input_data_type")); auto output_data_type_name = 
absl::get(attrs.at("dtype")); - auto input_data_type = common::Str2Type(input_data_type_name); - auto output_data_type = common::Str2Type(output_data_type_name); + auto input_data_type = cinn::common::Str2Type(input_data_type_name); + auto output_data_type = cinn::common::Str2Type(output_data_type_name); auto output_shape = std::vector(inputs_shape.begin(), inputs_shape.end()); @@ -124,7 +124,7 @@ std::vector InferShapeForBitcastConvert( std::vector InferDtypeForBitcastConvert( const std::vector &inputs_type, const framework::AttrMapType &attrs) { CHECK(attrs.count("dtype")); - return {common::Str2Type(absl::get(attrs.at("dtype")))}; + return {cinn::common::Str2Type(absl::get(attrs.at("dtype")))}; } } // namespace op diff --git a/paddle/cinn/hlir/op/contrib/cholesky.cc b/paddle/cinn/hlir/op/contrib/cholesky.cc index a1d43859c71bdd..1784ef570ed681 100644 --- a/paddle/cinn/hlir/op/contrib/cholesky.cc +++ b/paddle/cinn/hlir/op/contrib/cholesky.cc @@ -47,8 +47,8 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; std::shared_ptr StrategyForCholesky( const framework::NodeAttr &attrs, diff --git a/paddle/cinn/hlir/op/contrib/gather_nd.cc b/paddle/cinn/hlir/op/contrib/gather_nd.cc index 1a3f06330f6c76..9e23642a11d651 100644 --- a/paddle/cinn/hlir/op/contrib/gather_nd.cc +++ b/paddle/cinn/hlir/op/contrib/gather_nd.cc @@ -41,8 +41,8 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; ir::Tensor GatherNd(const ir::Tensor &x, const ir::Tensor &index, @@ -62,16 +62,17 @@ ir::Tensor GatherNd(const ir::Tensor &x, std::vector indices_position; for (size_t i = 0; i < index_shape_size - 1; ++i) { indices_position.push_back( - ir::Cast::Make(common::Int(32), indices[i])); + ir::Cast::Make(cinn::common::Int(32), indices[i])); } - indices_position.push_back(ir::Cast::Make(common::Int(32), Expr(0))); + indices_position.push_back( + ir::Cast::Make(cinn::common::Int(32), Expr(0))); size_t indices_position_size = indices_position.size(); std::vector real_indices; for (size_t i = 0; i < index_shape.back().as_int32(); ++i) { indices_position[indices_position_size - 1] = - ir::Cast::Make(common::Int(32), Expr(i)); + ir::Cast::Make(cinn::common::Int(32), Expr(i)); real_indices.push_back( - ir::Cast::Make(common::Int(32), index(indices_position))); + ir::Cast::Make(cinn::common::Int(32), index(indices_position))); } if (real_indices.size() == x_shape_size) { return x(real_indices); @@ -127,7 +128,7 @@ std::shared_ptr StrategyForGatherNd( lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of gather_nd_schedule is " "empty! 
Please check.\n"; - common::CINNValuePack arg_pack = args[0]; + cinn::common::CINNValuePack arg_pack = args[0]; std::vector vec_ast; for (int i = 0; i < arg_pack.size(); i++) { if (arg_pack[i].is_expr()) { @@ -150,9 +151,9 @@ std::shared_ptr StrategyForGatherNd( pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); } } - std::vector res{ - common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; + std::vector res{ + cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = cinn::common::CINNValuePack{res}; }); auto strategy = std::make_shared(); diff --git a/paddle/cinn/hlir/op/contrib/gather_nd_test.cc b/paddle/cinn/hlir/op/contrib/gather_nd_test.cc index ee5f47477a5de1..46702a7a80c63e 100644 --- a/paddle/cinn/hlir/op/contrib/gather_nd_test.cc +++ b/paddle/cinn/hlir/op/contrib/gather_nd_test.cc @@ -33,9 +33,9 @@ namespace hlir { namespace op { TEST(GenerateCode_Cpu, GatherNd) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); ir::Expr dim0(1); ir::Expr dim1(2); diff --git a/paddle/cinn/hlir/op/contrib/gaussian_random.cc b/paddle/cinn/hlir/op/contrib/gaussian_random.cc index 1d70e4098e7211..2b6a3019c6fcda 100644 --- a/paddle/cinn/hlir/op/contrib/gaussian_random.cc +++ b/paddle/cinn/hlir/op/contrib/gaussian_random.cc @@ -47,8 +47,8 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; std::shared_ptr StrategyForGaussianRandom( const framework::NodeAttr &attrs, @@ -88,7 +88,7 @@ std::vector InferDtypeForGaussianRandom( if (attrs.find("dtype") != attrs.end()) { dtype = absl::get(attrs.at("dtype")); } - std::vector res{common::Str2Type(dtype)}; + std::vector res{cinn::common::Str2Type(dtype)}; CHECK(res[0].is_float(32) || res[0].is_float(64)) << "gaussian_random only support float32 and float64, but here " << res[0] << "! 
Please check."; diff --git a/paddle/cinn/hlir/op/contrib/logical_right_shift.cc b/paddle/cinn/hlir/op/contrib/logical_right_shift.cc index f4223a2d794128..4f176850a4d3c9 100644 --- a/paddle/cinn/hlir/op/contrib/logical_right_shift.cc +++ b/paddle/cinn/hlir/op/contrib/logical_right_shift.cc @@ -42,9 +42,9 @@ namespace cinn { namespace hlir { namespace op { -using common::_CINNValuePack_; -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::_CINNValuePack_; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::OpStrategy; using framework::shape_t; using framework::StrategyFunction; @@ -54,9 +54,9 @@ ir::Tensor LogicalRightShift(const ir::Tensor &A, const Target &target, const std::string &output_name) { std::string extern_func = "cinn_"; - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { extern_func += "host_"; - } else if (target == common::DefaultNVGPUTarget()) { + } else if (target == cinn::common::DefaultNVGPUTarget()) { extern_func += "nvgpu_"; } else { CINN_NOT_IMPLEMENTED diff --git a/paddle/cinn/hlir/op/contrib/logical_right_shift_test.cc b/paddle/cinn/hlir/op/contrib/logical_right_shift_test.cc index 19315022167110..f23867a391d4bf 100644 --- a/paddle/cinn/hlir/op/contrib/logical_right_shift_test.cc +++ b/paddle/cinn/hlir/op/contrib/logical_right_shift_test.cc @@ -33,9 +33,9 @@ namespace hlir { namespace op { TEST(GenerateCode_Cpu, LogicalRightShift) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); lang::Placeholder x("x", std::vector{10}); lang::Placeholder y("y", std::vector{10}); ir::Tensor res = LogicalRightShift(x, y, target, "test_logical_right_shift"); diff --git a/paddle/cinn/hlir/op/contrib/lookup_table.cc b/paddle/cinn/hlir/op/contrib/lookup_table.cc index 2796d45e016b92..3e4ebd679a9047 100644 --- a/paddle/cinn/hlir/op/contrib/lookup_table.cc +++ b/paddle/cinn/hlir/op/contrib/lookup_table.cc @@ -43,8 +43,8 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; ir::Tensor LookupTable(const ir::Tensor& table, const ir::Tensor& ids, @@ -65,7 +65,7 @@ ir::Tensor LookupTable(const ir::Tensor& table, offsets.emplace_back(Expr(0)); // Because the current conversion rules have not been completed, static // conversion is done here. 
- auto ids_offset = ir::Cast::Make(common::I32(), ids(offsets)); + auto ids_offset = ir::Cast::Make(cinn::common::I32(), ids(offsets)); auto pred = ir::And::Make( Expr(padding_idx != -1), ir::EQ::Make(ids_offset, Expr(static_cast(padding_idx)))); @@ -73,7 +73,7 @@ ir::Tensor LookupTable(const ir::Tensor& table, ir::Cast::Make(table->type(), Expr(0)), table(ids_offset, indices.back())); }, - common::UniqName(output_name)); + cinn::common::UniqName(output_name)); } std::shared_ptr StrategyForLookupTable( diff --git a/paddle/cinn/hlir/op/contrib/lookup_table_test.cc b/paddle/cinn/hlir/op/contrib/lookup_table_test.cc index d09d4238f6268e..43a4e3526629f1 100644 --- a/paddle/cinn/hlir/op/contrib/lookup_table_test.cc +++ b/paddle/cinn/hlir/op/contrib/lookup_table_test.cc @@ -34,9 +34,9 @@ namespace hlir { namespace op { TEST(GenerateCode_Cpu, LookupTable) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); lang::Placeholder in1("in1", {10, 20}); lang::Placeholder in2("in2", std::vector{2, 2, 1}); @@ -72,9 +72,9 @@ TEST(GenerateCode_Gpu, LookupTable) { if (!cinn::runtime::IsCompiledWithCUDA()) { return; } - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); lang::Placeholder in1("in1", {10, 20}); lang::Placeholder in2("in2", std::vector{2, 2, 1}); diff --git a/paddle/cinn/hlir/op/contrib/one_hot.cc b/paddle/cinn/hlir/op/contrib/one_hot.cc index c7d0c3564da949..5070026a434c68 100644 --- a/paddle/cinn/hlir/op/contrib/one_hot.cc +++ b/paddle/cinn/hlir/op/contrib/one_hot.cc @@ -41,7 +41,7 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValuePack; +using cinn::common::CINNValuePack; ir::Tensor OneHot(const ir::Tensor& indices, const ir::Tensor& on_value, @@ -94,7 +94,7 @@ ir::Tensor OneHot(const ir::Tensor& indices, return ir::Select::Make( ir::EQ::Make(elem, idx), on_value_cast, off_value_cast); }, - common::UniqName(output_name)); + cinn::common::UniqName(output_name)); return res; } @@ -144,7 +144,7 @@ std::vector InferDtypeForOneHot(const std::vector& inputs_type, dtype = absl::get(attrs.at("dtype")); } - std::vector res{common::Str2Type(dtype)}; + std::vector res{cinn::common::Str2Type(dtype)}; return res; } @@ -175,7 +175,7 @@ std::shared_ptr StrategyForOneHot( lang::RetValue* ret) { CHECK(!args.empty()) << "The input argument of one_hot compute is empty! 
Please check.\n"; - common::CINNValuePack pack_args = args[0]; + cinn::common::CINNValuePack pack_args = args[0]; CHECK(!pack_args.empty()) << "at least one input tensor for transpose compute\n"; CHECK_GE(pack_args.size(), 3U); @@ -198,15 +198,15 @@ std::shared_ptr StrategyForOneHot( off_value, depth, axis, - common::Str2Type(dtype), + cinn::common::Str2Type(dtype), tensor_name); - std::vector res; + std::vector res; auto stages = CreateStages({indices, on_value, off_value}); stages->InsertLazily(out); - res.push_back(common::CINNValue(out)); - res.push_back(common::CINNValue(stages)); - *ret = common::CINNValuePack{res}; + res.push_back(cinn::common::CINNValue(out)); + res.push_back(cinn::common::CINNValue(stages)); + *ret = cinn::common::CINNValuePack{res}; }); auto strategy = std::make_shared(); diff --git a/paddle/cinn/hlir/op/contrib/one_hot_test.cc b/paddle/cinn/hlir/op/contrib/one_hot_test.cc index 572172de0ab41b..7478876c4766d5 100644 --- a/paddle/cinn/hlir/op/contrib/one_hot_test.cc +++ b/paddle/cinn/hlir/op/contrib/one_hot_test.cc @@ -33,9 +33,9 @@ namespace hlir { namespace op { TEST(GenerateCode_Cpu, OneHot) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); Expr m(4); Expr n(4); @@ -52,7 +52,7 @@ TEST(GenerateCode_Cpu, OneHot) { off_value, depth, axis, - common::Str2Type(dtype), + cinn::common::Str2Type(dtype), "test_one_hot"); poly::StageMap stages = poly::CreateStages({res}); diff --git a/paddle/cinn/hlir/op/contrib/randint.cc b/paddle/cinn/hlir/op/contrib/randint.cc index 335155fd5afad4..8838656b814c0d 100644 --- a/paddle/cinn/hlir/op/contrib/randint.cc +++ b/paddle/cinn/hlir/op/contrib/randint.cc @@ -47,8 +47,8 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; std::shared_ptr StrategyForRandInt( const framework::NodeAttr &attrs, @@ -86,7 +86,7 @@ std::vector InferShapeForRandInt( std::vector InferDtypeForRandInt(const std::vector &inputs_type, const framework::AttrMapType &attrs) { std::string dtype = "int32"; - std::vector res{common::Str2Type(dtype)}; + std::vector res{cinn::common::Str2Type(dtype)}; return res; } diff --git a/paddle/cinn/hlir/op/contrib/reciprocal.cc b/paddle/cinn/hlir/op/contrib/reciprocal.cc index f9a71f7172b5b4..9b89395f8277b8 100644 --- a/paddle/cinn/hlir/op/contrib/reciprocal.cc +++ b/paddle/cinn/hlir/op/contrib/reciprocal.cc @@ -42,9 +42,9 @@ namespace cinn { namespace hlir { namespace op { -using common::_CINNValuePack_; -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::_CINNValuePack_; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::OpStrategy; using framework::shape_t; using framework::StrategyFunction; @@ -71,7 +71,7 @@ ir::Tensor Reciprocal(const ir::Tensor &input, const std::string &output_name) { [=](const std::vector &indice) { ir::Tensor out_tensor(input); auto e = out_tensor(indice); - return common::make_const(input->type(), 1.0f) / e; + return cinn::common::make_const(input->type(), 1.0f) / e; }, output_name)}; } diff --git a/paddle/cinn/hlir/op/contrib/reciprocal_test.cc b/paddle/cinn/hlir/op/contrib/reciprocal_test.cc index c23afb3e7fd468..cfc22f878fc3d1 100644 --- a/paddle/cinn/hlir/op/contrib/reciprocal_test.cc +++ b/paddle/cinn/hlir/op/contrib/reciprocal_test.cc @@ -33,9 +33,9 @@ 
namespace hlir { namespace op { TEST(GenerateCode_Cpu, Reciprocal) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); ir::Expr n(4); ir::Expr m(2); diff --git a/paddle/cinn/hlir/op/contrib/repeat.cc b/paddle/cinn/hlir/op/contrib/repeat.cc index 6f35a0f9b058c5..f77e5939099b52 100644 --- a/paddle/cinn/hlir/op/contrib/repeat.cc +++ b/paddle/cinn/hlir/op/contrib/repeat.cc @@ -40,7 +40,7 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValuePack; +using cinn::common::CINNValuePack; std::vector Repeat(const ir::Tensor &tensor, int repeats, @@ -79,7 +79,7 @@ std::vector Repeat(const ir::Tensor &tensor, } return tensor(idx); }, - common::UniqName(output_name)); + cinn::common::UniqName(output_name)); return {res}; } @@ -166,22 +166,22 @@ std::shared_ptr StrategyForRepeat( std::vector out = Repeat(tensor_A, repeats, axis, tensor_name); CHECK(out.size() == 1U) << "The size of Repeat's output should be 1"; - std::vector res; + std::vector res; auto stages = CreateStages({tensor_A}); for (auto &t : out) { stages->InsertLazily(t); - res.push_back(common::CINNValue(t)); + res.push_back(cinn::common::CINNValue(t)); } - res.push_back(common::CINNValue(stages)); - *ret = common::CINNValuePack{res}; + res.push_back(cinn::common::CINNValue(stages)); + *ret = cinn::common::CINNValuePack{res}; }); framework::CINNSchedule repeat_schedule([=](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of repeat schedule is empty! Please check.\n"; - common::CINNValuePack arg_pack = args[0]; + cinn::common::CINNValuePack arg_pack = args[0]; std::vector vec_ast; for (int i = 0; i < arg_pack.size(); i++) { if (arg_pack[i].is_expr()) { @@ -204,9 +204,9 @@ std::shared_ptr StrategyForRepeat( pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); } } - std::vector res{ - common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; + std::vector res{ + cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = cinn::common::CINNValuePack{res}; }); auto strategy = std::make_shared(); diff --git a/paddle/cinn/hlir/op/contrib/repeat_test.cc b/paddle/cinn/hlir/op/contrib/repeat_test.cc index a5abd5bb758046..358bfdd52d0db4 100755 --- a/paddle/cinn/hlir/op/contrib/repeat_test.cc +++ b/paddle/cinn/hlir/op/contrib/repeat_test.cc @@ -33,9 +33,9 @@ namespace hlir { namespace op { TEST(GenerateCode_Cpu, Repeat) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); ir::Expr m(4); ir::Expr n(4); diff --git a/paddle/cinn/hlir/op/contrib/resize.cc b/paddle/cinn/hlir/op/contrib/resize.cc index 786365d21dcd92..d74f4647878b00 100644 --- a/paddle/cinn/hlir/op/contrib/resize.cc +++ b/paddle/cinn/hlir/op/contrib/resize.cc @@ -39,26 +39,26 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValuePack; +using cinn::common::CINNValuePack; #define __get_pixel(input, h, w, n, c, y, x) \ input({n, \ c, \ - common::AutoSimplify( \ + cinn::common::AutoSimplify( \ ir::Max::Make(ir::Min::Make(y, h - Expr(1)), Expr(0))), \ - common::AutoSimplify( \ + cinn::common::AutoSimplify( \ ir::Max::Make(ir::Min::Make(x, w - Expr(1)), Expr(0)))}) ir::Tensor Resize(const ir::Tensor &input, - const common::Target 
&target, + const cinn::common::Target &target, const std::vector &out_shape, const std::string &mode, const std::string &output_name) { std::string func_name; - if (target.arch == common::Target::Arch::NVGPU) { + if (target.arch == cinn::common::Target::Arch::NVGPU) { func_name.assign("cinn_cuda_resize_"); - } else if (target.arch == common::Target::Arch::X86) { + } else if (target.arch == cinn::common::Target::Arch::X86) { func_name.assign("cinn_host_resize_"); } else { LOG(FATAL) << "Resize only supports X86 and NVGPU ! Please Check.\n"; @@ -85,14 +85,16 @@ ir::Tensor Resize(const ir::Tensor &input, Expr value; if (mode == "nearest") { - Expr in_y = ir::Cast::Make(common::F32(), in_h) / - ir::Cast::Make(common::F32(), out_h) * - ir::Cast::Make(common::F32(), out_y); - Expr in_x = ir::Cast::Make(common::F32(), in_w) / - ir::Cast::Make(common::F32(), out_w) * - ir::Cast::Make(common::F32(), out_x); - Expr in_y_int = ir::Cast::Make(common::Int(32), lang::Floor(in_y)); - Expr in_x_int = ir::Cast::Make(common::Int(32), lang::Floor(in_x)); + Expr in_y = ir::Cast::Make(cinn::common::F32(), in_h) / + ir::Cast::Make(cinn::common::F32(), out_h) * + ir::Cast::Make(cinn::common::F32(), out_y); + Expr in_x = ir::Cast::Make(cinn::common::F32(), in_w) / + ir::Cast::Make(cinn::common::F32(), out_w) * + ir::Cast::Make(cinn::common::F32(), out_x); + Expr in_y_int = + ir::Cast::Make(cinn::common::Int(32), lang::Floor(in_y)); + Expr in_x_int = + ir::Cast::Make(cinn::common::Int(32), lang::Floor(in_x)); std::vector in_indices = { indices[0], indices[1], in_y_int, in_x_int}; value = input(in_indices); @@ -126,7 +128,7 @@ ir::Tensor Resize(const ir::Tensor &input, return value; }, - common::UniqName(output_name)); + cinn::common::UniqName(output_name)); return res; } @@ -209,19 +211,19 @@ std::shared_ptr StrategyForResize( ir::Tensor out = Resize(tensor_A, target, out_shape, mode, tensor_name); - std::vector res; + std::vector res; auto stages = CreateStages({tensor_A}); stages->InsertLazily(out); - res.push_back(common::CINNValue(out)); - res.push_back(common::CINNValue(stages)); - *ret = common::CINNValuePack{res}; + res.push_back(cinn::common::CINNValue(out)); + res.push_back(cinn::common::CINNValue(stages)); + *ret = cinn::common::CINNValuePack{res}; }); framework::CINNSchedule resize_schedule([=](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of resize schedule is empty! 
Please check.\n"; - common::CINNValuePack arg_pack = args[0]; + cinn::common::CINNValuePack arg_pack = args[0]; std::vector vec_ast; for (int i = 0; i < arg_pack.size(); i++) { if (arg_pack[i].is_expr()) { @@ -244,9 +246,9 @@ std::shared_ptr StrategyForResize( pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); } } - std::vector res{ - common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; + std::vector res{ + cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = cinn::common::CINNValuePack{res}; }); auto strategy = std::make_shared(); diff --git a/paddle/cinn/hlir/op/contrib/resize.h b/paddle/cinn/hlir/op/contrib/resize.h index 77bf94878870db..b087eb36a9b51f 100644 --- a/paddle/cinn/hlir/op/contrib/resize.h +++ b/paddle/cinn/hlir/op/contrib/resize.h @@ -26,7 +26,7 @@ namespace hlir { namespace op { ir::Tensor Resize(const ir::Tensor &x, - const common::Target &target, + const cinn::common::Target &target, const std::vector &out_shape, const std::string &mode, const std::string &output_name); diff --git a/paddle/cinn/hlir/op/contrib/sort.cc b/paddle/cinn/hlir/op/contrib/sort.cc index 0941d2690b604a..8adc618e352e61 100644 --- a/paddle/cinn/hlir/op/contrib/sort.cc +++ b/paddle/cinn/hlir/op/contrib/sort.cc @@ -40,20 +40,20 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; std::vector ArgSort(const ir::Tensor &A, - const common::Target &target, + const cinn::common::Target &target, poly::StageMap stages, const int &axis, const bool &is_ascend, const std::string &name) { std::string find_func_name; std::string index_func_name; - if (target.arch == common::Target::Arch::NVGPU) { + if (target.arch == cinn::common::Target::Arch::NVGPU) { find_func_name.assign("cinn_nvgpu_next_smallest_int32"); - } else if (target.arch == common::Target::Arch::X86) { + } else if (target.arch == cinn::common::Target::Arch::X86) { find_func_name.assign("cinn_host_next_smallest_int32"); } else { LOG(FATAL) << "ArgSort only supports X86 and NVGPU ! Please Check.\n"; @@ -84,8 +84,8 @@ std::vector ArgSort(const ir::Tensor &A, stride = stride * A->shape[i]; } } - offset = common::AutoSimplify(offset); - stride = common::AutoSimplify(stride); + offset = cinn::common::AutoSimplify(offset); + stride = cinn::common::AutoSimplify(stride); auto A_shape_axis = A->shape[pos_axis]; return lang::CallExtern(index_func_name, {A, A_shape_axis, A(indices), offset, stride}); @@ -106,8 +106,8 @@ std::vector ArgSort(const ir::Tensor &A, stride = stride * A->shape[i]; } } - offset = common::AutoSimplify(offset); - stride = common::AutoSimplify(stride); + offset = cinn::common::AutoSimplify(offset); + stride = cinn::common::AutoSimplify(stride); auto A_shape_axis = A->shape[pos_axis]; auto idx = lang::CallExtern( @@ -121,7 +121,7 @@ std::vector ArgSort(const ir::Tensor &A, } std::vector Sort(const ir::Tensor &A, - const common::Target &target, + const cinn::common::Target &target, poly::StageMap stages, const int &axis, const bool &is_ascend, @@ -192,7 +192,7 @@ std::shared_ptr StrategyForSort( [=](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of sort_schedule is empty! 
Please check.\n"; - common::CINNValuePack arg_pack = args[0]; + cinn::common::CINNValuePack arg_pack = args[0]; std::vector vec_ast; for (int i = 0; i < arg_pack.size(); i++) { if (arg_pack[i].is_expr()) { @@ -218,9 +218,9 @@ std::shared_ptr StrategyForSort( pe::IRScheduleInjectiveCPU( ir_sch, output_shapes.front(), target, true); } - std::vector res{ - common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; + std::vector res{ + cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = cinn::common::CINNValuePack{res}; }); auto strategy = std::make_shared(); @@ -275,7 +275,7 @@ std::shared_ptr StrategyForArgSort( lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of argsort_schedule is empty! Please check.\n"; - common::CINNValuePack arg_pack = args[0]; + cinn::common::CINNValuePack arg_pack = args[0]; std::vector vec_ast; for (int i = 0; i < arg_pack.size(); i++) { if (arg_pack[i].is_expr()) { @@ -300,9 +300,9 @@ std::shared_ptr StrategyForArgSort( if (prod_size > 1 && target.arch == Target::Arch::X86) { pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true); } - std::vector res{ - common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; + std::vector res{ + cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = cinn::common::CINNValuePack{res}; }); auto strategy = std::make_shared(); diff --git a/paddle/cinn/hlir/op/contrib/sort.h b/paddle/cinn/hlir/op/contrib/sort.h index bb07855666f214..93660861ef91b3 100644 --- a/paddle/cinn/hlir/op/contrib/sort.h +++ b/paddle/cinn/hlir/op/contrib/sort.h @@ -26,14 +26,14 @@ namespace hlir { namespace op { std::vector ArgSort(const ir::Tensor& A, - const common::Target& target, + const cinn::common::Target& target, poly::StageMap stages, const int& axis, const bool& is_ascend, const std::string& name); std::vector Sort(const ir::Tensor& A, - const common::Target& target, + const cinn::common::Target& target, poly::StageMap stages, const int& axis, const bool& is_ascend, diff --git a/paddle/cinn/hlir/op/contrib/sort_test.cc b/paddle/cinn/hlir/op/contrib/sort_test.cc index 3d2a8f6c73e38d..76386dea2ecef1 100644 --- a/paddle/cinn/hlir/op/contrib/sort_test.cc +++ b/paddle/cinn/hlir/op/contrib/sort_test.cc @@ -33,9 +33,9 @@ namespace hlir { namespace op { TEST(GenerateCode_Cpu, ArgSort) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); ir::Expr n(4); ir::Expr h(28); @@ -72,9 +72,9 @@ TEST(GenerateCode_Cpu, ArgSort) { } TEST(GenerateCode_Cpu, Sort) { - common::Context::Global().ResetNameId(); + cinn::common::Context::Global().ResetNameId(); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); ir::Expr n(4); ir::Expr h(28); diff --git a/paddle/cinn/hlir/op/contrib/triangular_solve.cc b/paddle/cinn/hlir/op/contrib/triangular_solve.cc index 3ec35013fc417d..a6b45405367ab8 100644 --- a/paddle/cinn/hlir/op/contrib/triangular_solve.cc +++ b/paddle/cinn/hlir/op/contrib/triangular_solve.cc @@ -33,8 +33,8 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; std::shared_ptr StrategyForTriangularSolve( const framework::NodeAttr &attrs, diff --git a/paddle/cinn/hlir/op/contrib/uniform_random.cc 
b/paddle/cinn/hlir/op/contrib/uniform_random.cc index cd034560f8feba..0002f1be773557 100644 --- a/paddle/cinn/hlir/op/contrib/uniform_random.cc +++ b/paddle/cinn/hlir/op/contrib/uniform_random.cc @@ -47,8 +47,8 @@ namespace cinn { namespace hlir { namespace op { -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; std::shared_ptr StrategyForUniformRandom( const framework::NodeAttr &attrs, @@ -89,7 +89,7 @@ std::vector InferDtypeForUniformRandom( if (attrs.find("dtype") != attrs.end()) { dtype = absl::get(attrs.at("dtype")); } - std::vector res{common::Str2Type(dtype)}; + std::vector res{cinn::common::Str2Type(dtype)}; CHECK(res[0].is_float(32) || res[0].is_float(64)) << "uniform_random only support float32 and float64, but here " << res[0] << "! Please check."; diff --git a/paddle/cinn/hlir/op/custom_call.cc b/paddle/cinn/hlir/op/custom_call.cc index bf411c0fb502fe..91c3ee6db08986 100644 --- a/paddle/cinn/hlir/op/custom_call.cc +++ b/paddle/cinn/hlir/op/custom_call.cc @@ -34,9 +34,9 @@ namespace cinn { namespace hlir { namespace op { -using common::_CINNValuePack_; -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::_CINNValuePack_; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::OpStrategy; using framework::shape_t; using framework::StrategyFunction; @@ -54,14 +54,14 @@ class CustomCallArgsFuncRegistry { } void Register(const std::string &custom_call, - const common::Target &target, + const cinn::common::Target &target, ArgsFunc args_func) { auto id = custom_call + "_" + target.arch_str(); func_map_[id] = args_func; } ArgsFunc Lookup(const std::string &custom_call, - const common::Target &target) { + const cinn::common::Target &target) { auto id = custom_call + "_" + target.arch_str(); CHECK(func_map_.count(id)) << "Can't find " << custom_call << " for target " << target.arch_str(); @@ -100,7 +100,7 @@ std::shared_ptr StrategyForCustomCall( ir::Argument(kernel_args, ir::Argument::IO::kOutput), ir::Argument(kernel_args_num, ir::Argument::IO::kInput)}; // if target is nvgpu, add stream. 
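// Illustrative usage sketch for CustomCallArgsFuncRegistry above; the call
// name appears in this file, the rest is an assumption for illustration.
// Builders are keyed by call name plus target arch (Lookup composes the id as
// custom_call + "_" + target.arch_str()), so one external call name can bind
// a different argument builder per backend.
//
//   auto args_fn = CustomCallArgsFuncRegistry::Global().Lookup(
//       "cinn_call_cublas", cinn::common::DefaultNVGPUTarget());
//   auto call_args = args_fn(attrs, inputs, output_shapes);  // hypothetical use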
- if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { ir::Var kernel_stream(KERNEL_STREAM, type_of()); host_args.push_back(kernel_stream); @@ -906,7 +906,7 @@ std::vector CustomCallArgsForMemset( } const auto &dtype = - common::Str2Type(absl::get(attr_store.at("dtype"))); + cinn::common::Str2Type(absl::get(attr_store.at("dtype"))); count *= dtype.bytes(); VLOG(4) << "call memset custom_call with value=" << utils::Attribute2String(value_attr) << " (" << value @@ -939,60 +939,68 @@ std::vector CustomCallArgsForMemcpy( bool RegisteryCustomCallArgsFunc() { #ifdef CINN_WITH_CUDA - CustomCallArgsFuncRegistry::Global().Register("cinn_call_cublas", - common::DefaultNVGPUTarget(), - CustomCallArgsForCublas); + CustomCallArgsFuncRegistry::Global().Register( + "cinn_call_cublas", + cinn::common::DefaultNVGPUTarget(), + CustomCallArgsForCublas); CustomCallArgsFuncRegistry::Global().Register( "cinn_call_gaussian_random", - common::DefaultNVGPUTarget(), + cinn::common::DefaultNVGPUTarget(), CustomCallArgsForGaussianRandom); - CustomCallArgsFuncRegistry::Global().Register("cinn_call_uniform_random", - common::DefaultNVGPUTarget(), - CustomCallArgsForUniformRandom); - CustomCallArgsFuncRegistry::Global().Register("cinn_call_randint", - common::DefaultNVGPUTarget(), - CustomCallArgsForRandInt); - CustomCallArgsFuncRegistry::Global().Register("cinn_call_cholesky_nvgpu", - common::DefaultNVGPUTarget(), - CustomCallArgsForCholesky); - CustomCallArgsFuncRegistry::Global().Register("cinn_call_batched_cublas", - common::DefaultNVGPUTarget(), - CustomCallArgsForBatchedCublas); + CustomCallArgsFuncRegistry::Global().Register( + "cinn_call_uniform_random", + cinn::common::DefaultNVGPUTarget(), + CustomCallArgsForUniformRandom); + CustomCallArgsFuncRegistry::Global().Register( + "cinn_call_randint", + cinn::common::DefaultNVGPUTarget(), + CustomCallArgsForRandInt); + CustomCallArgsFuncRegistry::Global().Register( + "cinn_call_cholesky_nvgpu", + cinn::common::DefaultNVGPUTarget(), + CustomCallArgsForCholesky); + CustomCallArgsFuncRegistry::Global().Register( + "cinn_call_batched_cublas", + cinn::common::DefaultNVGPUTarget(), + CustomCallArgsForBatchedCublas); CustomCallArgsFuncRegistry::Global().Register( "cinn_call_triangular_solve_nvgpu", - common::DefaultNVGPUTarget(), + cinn::common::DefaultNVGPUTarget(), CustomCallArgsForTriangularSolve); - CustomCallArgsFuncRegistry::Global().Register("cinn_assert_true_nvgpu", - common::DefaultNVGPUTarget(), - CustomCallArgsForAssertTrue); - CustomCallArgsFuncRegistry::Global().Register("cinn_call_cuda_memset", - common::DefaultNVGPUTarget(), - CustomCallArgsForMemset); - CustomCallArgsFuncRegistry::Global().Register("cinn_call_cuda_memcpy", - common::DefaultNVGPUTarget(), - CustomCallArgsForMemcpy); + CustomCallArgsFuncRegistry::Global().Register( + "cinn_assert_true_nvgpu", + cinn::common::DefaultNVGPUTarget(), + CustomCallArgsForAssertTrue); + CustomCallArgsFuncRegistry::Global().Register( + "cinn_call_cuda_memset", + cinn::common::DefaultNVGPUTarget(), + CustomCallArgsForMemset); + CustomCallArgsFuncRegistry::Global().Register( + "cinn_call_cuda_memcpy", + cinn::common::DefaultNVGPUTarget(), + CustomCallArgsForMemcpy); #endif #ifdef CINN_WITH_CUDNN CustomCallArgsFuncRegistry::Global().Register( "cinn_call_cudnn_conv2d_forward", - common::DefaultNVGPUTarget(), + cinn::common::DefaultNVGPUTarget(), CustomCallArgsForCudnnConvForward); CustomCallArgsFuncRegistry::Global().Register( "cinn_call_cudnn_conv2d_backward_data", - 
common::DefaultNVGPUTarget(), + cinn::common::DefaultNVGPUTarget(), CustomCallArgsForCudnnConvBackwardData); CustomCallArgsFuncRegistry::Global().Register( "cinn_call_cudnn_conv2d_backward_filter", - common::DefaultNVGPUTarget(), + cinn::common::DefaultNVGPUTarget(), CustomCallArgsForCudnnConvBackwardFilter); CustomCallArgsFuncRegistry::Global().Register( "cinn_call_cudnn_pool2d_forward", - common::DefaultNVGPUTarget(), + cinn::common::DefaultNVGPUTarget(), CustomCallArgsForCudnnPoolForward); CustomCallArgsFuncRegistry::Global().Register( "cinn_call_cudnn_pool2d_backward", - common::DefaultNVGPUTarget(), + cinn::common::DefaultNVGPUTarget(), CustomCallArgsForCudnnPoolBackward); #endif @@ -1002,15 +1010,17 @@ bool RegisteryCustomCallArgsFunc() { #ifdef CINN_WITH_MKL_CBLAS - CustomCallArgsFuncRegistry::Global().Register("cinn_call_cholesky_host", - common::DefaultHostTarget(), - CustomCallArgsForCholesky); + CustomCallArgsFuncRegistry::Global().Register( + "cinn_call_cholesky_host", + cinn::common::DefaultHostTarget(), + CustomCallArgsForCholesky); #endif - CustomCallArgsFuncRegistry::Global().Register("cinn_assert_true_host", - common::DefaultHostTarget(), - CustomCallArgsForAssertTrue); + CustomCallArgsFuncRegistry::Global().Register( + "cinn_assert_true_host", + cinn::common::DefaultHostTarget(), + CustomCallArgsForAssertTrue); return true; } diff --git a/paddle/cinn/hlir/op/elementwise.cc b/paddle/cinn/hlir/op/elementwise.cc index a488391714dd8f..78df4cf0b78ab7 100644 --- a/paddle/cinn/hlir/op/elementwise.cc +++ b/paddle/cinn/hlir/op/elementwise.cc @@ -31,9 +31,9 @@ namespace cinn { namespace hlir { namespace op { -using common::_CINNValuePack_; -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::_CINNValuePack_; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::OpStrategy; using framework::shape_t; using framework::StrategyFunction; @@ -167,8 +167,8 @@ std::shared_ptr StrategyForScale( // Paddle upscale float16 or bfloat16 compute to float32, // we made CINN consistent with this behavior of Paddle - bool should_upscale_fp32 = - A->type() == common::F16() || A->type() == common::BF16(); + bool should_upscale_fp32 = A->type() == cinn::common::F16() || + A->type() == cinn::common::BF16(); out = Compute( A->shape, @@ -180,8 +180,9 @@ std::shared_ptr StrategyForScale( ? Expr(bias) : ir::Cast::Make(A->type(), Expr(bias)); Expr cast_A_indice = - should_upscale_fp32 ? ir::Cast::Make(common::F32(), A(indice)) - : A(indice); + should_upscale_fp32 + ? ir::Cast::Make(cinn::common::F32(), A(indice)) + : A(indice); Expr add_result = bias_after_scale ? cast_scale * cast_A_indice + cast_bias : cast_scale * (cast_A_indice + cast_bias); @@ -290,7 +291,7 @@ std::vector InferDtypeForConstScalar( if (attrs.find("dtype") != attrs.end()) { auto dtype_str = absl::get(attrs.at("dtype")); if (!dtype_str.empty()) { - out_type = common::Str2Type(dtype_str); + out_type = cinn::common::Str2Type(dtype_str); } } else { auto scalar = GetScalarExpr(attrs.at("value")); @@ -368,7 +369,7 @@ std::shared_ptr StrategyForFillConstant( CHECK(attrs.attr_store.count("force_cpu")); force_cpu = absl::get(attrs.attr_store.at("force_cpu")); - if (force_cpu && target != common::DefaultHostTarget()) { + if (force_cpu && target != cinn::common::DefaultHostTarget()) { LOG(WARNING) << "The attribute \"force_cpu\" of \"fill_constant\" " "not supported in CINN! 
The \"fill_constant\"'s " "output tensor will placed on " @@ -412,19 +413,19 @@ std::vector InferShapeForFillConstant( std::vector InferDtypeForFillConstant( const std::vector &inputs_type, const framework::AttrMapType &attrs) { - common::Type out_type; + cinn::common::Type out_type; CHECK(attrs.count("value")); if (attrs.find("dtype") != attrs.end()) { // attribute [dtype] are given auto dtype_str = absl::get(attrs.at("dtype")); - out_type = common::Str2Type(dtype_str); + out_type = cinn::common::Str2Type(dtype_str); VLOG(3) << "FillConstant output dtype (from [dtype]): " << dtype_str; } else { // attribute [dtype] no given, inferred by value's type auto scalar = GetScalarExpr(attrs.at("value")); out_type = scalar->type(); VLOG(3) << "FillConstant scalar type (from [value]): " - << common::Type2Str(out_type); + << cinn::common::Type2Str(out_type); } return {out_type}; } @@ -542,7 +543,7 @@ std::vector InferDtypeForAssignValue( auto dtype_str = absl::get(attrs.at("dtype")); if (!dtype_str.empty()) { // if the [dtype] is not empty, output as the given type - out_type = common::Str2Type(dtype_str); + out_type = cinn::common::Str2Type(dtype_str); } } @@ -555,10 +556,10 @@ std::vector InferDtypeForAssignValue( #define EXPAND_ATTR_TO_GET_DTYPE(TYPE) \ else if (absl::get_if(&value)) { /*NOLINT*/ \ - out_type = common::type_of(); \ + out_type = cinn::common::type_of(); \ } \ else if (absl::get_if>(&value)) { /*NOLINT*/ \ - out_type = common::type_of(); \ + out_type = cinn::common::type_of(); \ } if (false) { // NOLINT @@ -918,7 +919,7 @@ std::shared_ptr StrategyForCast( std::vector InferDtypeForCast(const std::vector &inputs_type, const framework::AttrMapType &attrs) { CHECK(attrs.count("dtype")); - return {common::Str2Type(absl::get(attrs.at("dtype")))}; + return {cinn::common::Str2Type(absl::get(attrs.at("dtype")))}; } std::shared_ptr StrategyForArange( @@ -936,7 +937,8 @@ std::shared_ptr StrategyForArange( auto start = absl::get(attr_store.at("start")); auto stop = absl::get(attr_store.at("stop")); auto step = absl::get(attr_store.at("step")); - auto dtype = common::Str2Type(absl::get(attr_store.at("dtype"))); + auto dtype = + cinn::common::Str2Type(absl::get(attr_store.at("dtype"))); framework::CINNCompute arange_compute( [=](lang::Args args, lang::RetValue *ret) { @@ -948,10 +950,10 @@ std::shared_ptr StrategyForArange( std::string tensor_name = pack_args[0].operator std::string(); auto out = pe::Arange(start, stop, step, dtype, tensor_name); - std::vector res; + std::vector res; auto stages = CreateStages({out}); - res.push_back(common::CINNValue(out)); - res.push_back(common::CINNValue(stages)); + res.push_back(cinn::common::CINNValue(out)); + res.push_back(cinn::common::CINNValue(stages)); *ret = CINNValuePack{res}; }); @@ -984,12 +986,12 @@ std::vector> InferShapeForArange( std::vector InferDtypeForArange(const std::vector &inputs_type, const framework::AttrMapType &attrs) { CHECK(attrs.count("dtype")); - return {common::Str2Type(absl::get(attrs.at("dtype")))}; + return {cinn::common::Str2Type(absl::get(attrs.at("dtype")))}; } std::vector InferDtypeForLogicalNot(const std::vector &inputs_type, const framework::AttrMapType &attrs) { - return {common::Bool()}; + return {cinn::common::Bool()}; } } // namespace op diff --git a/paddle/cinn/hlir/op/external_api_registry.cc b/paddle/cinn/hlir/op/external_api_registry.cc index 000f8b92de9054..cddb10c63b1467 100644 --- a/paddle/cinn/hlir/op/external_api_registry.cc +++ b/paddle/cinn/hlir/op/external_api_registry.cc @@ -18,13 +18,13 @@ namespace 
cinn { namespace hlir { namespace op { -ExternalApiInfo& ExternalApiRegistry::Register(const std::string& op_name, - const common::Target& target) { +ExternalApiInfo& ExternalApiRegistry::Register( + const std::string& op_name, const cinn::common::Target& target) { return __REGISTER__(GenKey(op_name, target)); } -std::string ExternalApiRegistry::GetExternalApi(const framework::Node* op_node, - const common::Target& target) { +std::string ExternalApiRegistry::GetExternalApi( + const framework::Node* op_node, const cinn::common::Target& target) { CHECK(op_node->attrs.attr_store.count("original_op")) << "a custom_call op must store its original op name"; std::string op_name = @@ -44,7 +44,7 @@ std::string ExternalApiRegistry::GetExternalApi(const framework::Node* op_node, } std::string ExternalApiRegistry::GenKey(const std::string& op_name, - const common::Target& target) { + const cinn::common::Target& target) { std::ostringstream oss; oss << target; return op_name + "_" + oss.str(); diff --git a/paddle/cinn/hlir/op/external_api_registry.h b/paddle/cinn/hlir/op/external_api_registry.h index 307cac68b2f20e..e7d85cca784fce 100644 --- a/paddle/cinn/hlir/op/external_api_registry.h +++ b/paddle/cinn/hlir/op/external_api_registry.h @@ -61,22 +61,23 @@ class ExternalApiRegistry : public Registry { } ExternalApiInfo& Register(const std::string& op_name, - const common::Target& target); + const cinn::common::Target& target); - bool Has(const std::string& op_name, const common::Target& target) { + bool Has(const std::string& op_name, const cinn::common::Target& target) { return nullptr != Registry::Find(GenKey(op_name, target)); } // return the api name on the specified target std::string GetExternalApi(const framework::Node* op_node, - const common::Target& target); + const cinn::common::Target& target); private: ExternalApiRegistry() = default; CINN_DISALLOW_COPY_AND_ASSIGN(ExternalApiRegistry); // the registered key consist of the name of op and the specified target - std::string GenKey(const std::string& op_name, const common::Target& target); + std::string GenKey(const std::string& op_name, + const cinn::common::Target& target); }; } // namespace op diff --git a/paddle/cinn/hlir/op/external_api_registry_test.cc b/paddle/cinn/hlir/op/external_api_registry_test.cc index 186fb8fa532624..36a15666277e67 100644 --- a/paddle/cinn/hlir/op/external_api_registry_test.cc +++ b/paddle/cinn/hlir/op/external_api_registry_test.cc @@ -27,12 +27,12 @@ using cinn::hlir::framework::Node; using cinn::hlir::op::ExternalApiRegistry; TEST(ExternalApiRegistry, Has) { - ASSERT_TRUE(ExternalApiRegistry::Global()->Has("matmul", - common::DefaultNVGPUTarget())); - ASSERT_TRUE(ExternalApiRegistry::Global()->Has("cholesky", - common::DefaultHostTarget())); + ASSERT_TRUE(ExternalApiRegistry::Global()->Has( + "matmul", cinn::common::DefaultNVGPUTarget())); + ASSERT_TRUE(ExternalApiRegistry::Global()->Has( + "cholesky", cinn::common::DefaultHostTarget())); ASSERT_FALSE(ExternalApiRegistry::Global()->Has( - "op_doesn't_exist", common::DefaultNVGPUTarget())); + "op_doesn't_exist", cinn::common::DefaultNVGPUTarget())); } TEST(ExternalApiRegistry, GetExternalApi) { @@ -41,13 +41,13 @@ TEST(ExternalApiRegistry, GetExternalApi) { node->attrs.attr_store["original_op"] = std::string("matmul"); ASSERT_EQ("cinn_call_cublas", ExternalApiRegistry::Global()->GetExternalApi( - node.get(), common::DefaultNVGPUTarget())); + node.get(), cinn::common::DefaultNVGPUTarget())); #ifdef CINN_WITH_CUDNN node->attrs.attr_store["conv_type"] = 
std::string("backward_data"); node->attrs.attr_store["original_op"] = std::string("conv2d"); ASSERT_EQ("cinn_call_cudnn_conv2d_backward_data", ExternalApiRegistry::Global()->GetExternalApi( - node.get(), common::DefaultNVGPUTarget())); + node.get(), cinn::common::DefaultNVGPUTarget())); #endif } diff --git a/paddle/cinn/hlir/op/nn.cc b/paddle/cinn/hlir/op/nn.cc index 22bd95dcbf0109..ca5d542d85a125 100644 --- a/paddle/cinn/hlir/op/nn.cc +++ b/paddle/cinn/hlir/op/nn.cc @@ -32,9 +32,9 @@ namespace cinn { namespace hlir { namespace op { -using common::_CINNValuePack_; -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::_CINNValuePack_; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::OpStrategy; using framework::shape_t; using framework::StrategyFunction; @@ -481,7 +481,7 @@ std::vector InferShapeForConv2d( -1, -1, Float(32), - common::DefaultHostTarget(), + cinn::common::DefaultHostTarget(), key); int ic_bn = conv2d_factors["ic_bn"]; int oc_bn = conv2d_factors["oc_bn"]; @@ -885,7 +885,7 @@ std::shared_ptr StrategyForDepthwiseConv2d( [=](lang::Args args, lang::RetValue *ret) { CHECK(!args.empty()) << "The input argument of InjectiveSchedule is " "empty! Please check.\n"; - common::CINNValuePack arg_pack = args[0]; + cinn::common::CINNValuePack arg_pack = args[0]; std::vector vec_ast; std::vector vec_tensor; for (int i = 0; i < arg_pack.size(); i++) { @@ -906,9 +906,9 @@ std::shared_ptr StrategyForDepthwiseConv2d( } else { CINN_NOT_IMPLEMENTED } - std::vector res{ - common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; + std::vector res{ + cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = cinn::common::CINNValuePack{res}; }); auto strategy = std::make_shared(); diff --git a/paddle/cinn/hlir/op/op_broadcast_test.cc b/paddle/cinn/hlir/op/op_broadcast_test.cc index 8981712f5da643..4acb3371587cbd 100644 --- a/paddle/cinn/hlir/op/op_broadcast_test.cc +++ b/paddle/cinn/hlir/op/op_broadcast_test.cc @@ -48,7 +48,7 @@ TEST(Operator, Operator_ElementWise_Add_Test0) { NodeAttr attrs; std::vector inputs{A.tensor(), B.tensor()}; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); auto impl = OpStrategy::SelectImpl(strategy[add]( attrs, inputs, type, {{M.as_int32(), N.as_int32()}}, target)); ASSERT_EQ(impl->name, "strategy.elementwise_add.x86"); @@ -58,10 +58,10 @@ TEST(Operator, Operator_ElementWise_Add_Test0) { Module::Builder builder("module0", target); std::string out_name = "C"; - common::CINNValuePack cinn_input = - common::CINNValuePack{{common::CINNValue(A), - common::CINNValue(B), - common::CINNValue(out_name)}}; + cinn::common::CINNValuePack cinn_input = + cinn::common::CINNValuePack{{cinn::common::CINNValue(A), + cinn::common::CINNValue(B), + cinn::common::CINNValue(out_name)}}; std::vector input_output_names{"A", "B", out_name}; auto funcs = framework::GetFuncFromImpl( @@ -83,28 +83,29 @@ TEST(Operator, Operator_ElementWise_Add_Test0) { cinn_buffer_t *B_buf; int set_value = 0; if (set_value != 0) { - A_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + A_buf = cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) .set_align(512) .set_val(set_value) .Build(); - B_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + B_buf = cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) .set_align(512) .set_val(set_value) 
.Build(); } else { - A_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + A_buf = cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) .set_align(512) .set_random() .Build(); - B_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + B_buf = cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) .set_align(512) .set_random() .Build(); } - auto *C_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_align(512) - .set_zero() - .Build(); + auto *C_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_align(512) + .set_zero() + .Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf), c_arg(C_buf); cinn_pod_value_t args[] = {a_arg, b_arg, c_arg}; @@ -131,7 +132,7 @@ TEST(Operator, Operator_ElementWise_Add_Test1) { attrs.attr_store["axis"] = 1; std::vector inputs{A.tensor(), B.tensor()}; std::vector type{Float(32)}; - common::Target target = common::DefaultNVGPUTarget(); + cinn::common::Target target = cinn::common::DefaultNVGPUTarget(); auto impl = OpStrategy::SelectImpl( strategy[add](attrs, inputs, type, {{100, 32}}, target)); ASSERT_EQ(impl->name, "strategy.elementwise_add.x86"); @@ -141,10 +142,10 @@ TEST(Operator, Operator_ElementWise_Add_Test1) { Module::Builder builder("module", target); std::string out_name = "C"; - common::CINNValuePack cinn_input = - common::CINNValuePack{{common::CINNValue(A), - common::CINNValue(B), - common::CINNValue(out_name)}}; + cinn::common::CINNValuePack cinn_input = + cinn::common::CINNValuePack{{cinn::common::CINNValue(A), + cinn::common::CINNValue(B), + cinn::common::CINNValue(out_name)}}; std::vector input_output_names{"A", "B", out_name}; auto funcs = framework::GetFuncFromImpl( @@ -181,7 +182,7 @@ TEST(Operator, Operator_BroadcastTo) { std::vector inputs{B.tensor()}; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); auto impl = OpStrategy::SelectImpl( strategy[broadcast_to](attrs, inputs, type, {out_shape}, target)); @@ -189,8 +190,8 @@ TEST(Operator, Operator_BroadcastTo) { std::string func_name = "broadcast_to"; std::string out_name = "C"; - common::CINNValuePack cinn_input = common::CINNValuePack{ - {common::CINNValue(B), common::CINNValue(out_name)}}; + cinn::common::CINNValuePack cinn_input = cinn::common::CINNValuePack{ + {cinn::common::CINNValue(B), cinn::common::CINNValue(out_name)}}; std::vector input_output_names{"B", out_name}; auto funcs = framework::GetFuncFromImpl( @@ -201,12 +202,12 @@ TEST(Operator, Operator_BroadcastTo) { } } -common::CINNValuePack GetComputeResult( +cinn::common::CINNValuePack GetComputeResult( const std::shared_ptr &impl, - std::vector &cinn_inputs, // NOLINT + std::vector &cinn_inputs, // NOLINT const std::string &output_name = "") { cinn_inputs.emplace_back(output_name); - return impl->fcompute(common::CINNValuePack{cinn_inputs}); + return impl->fcompute(cinn::common::CINNValuePack{cinn_inputs}); } TEST(Operator, Operator_BroadcastTo_0) { @@ -233,21 +234,22 @@ TEST(Operator, Operator_BroadcastTo_0) { attrs.attr_store["dim"] = dim; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); auto impl_0 = OpStrategy::SelectImpl(strategy[const_scalar]( attrs, std::vector{}, type, {out_shape}, target)); - std::vector cinn_inputs; - common::CINNValuePack rets_0 = GetComputeResult(impl_0, cinn_inputs, "out_0"); + 
std::vector cinn_inputs; + cinn::common::CINNValuePack rets_0 = + GetComputeResult(impl_0, cinn_inputs, "out_0"); ir::Expr out_0 = rets_0[0]; auto tensor_0 = out_0.as_tensor_ref(); poly::StageMap stages_0 = rets_0.back(); auto impl_1 = OpStrategy::SelectImpl( strategy[broadcast_to](attrs, {tensor_0}, type, {out_shape}, target)); - std::vector cinn_inputs_1 = { - {common::CINNValue(tensor_0)}}; - common::CINNValuePack rets_1 = + std::vector cinn_inputs_1 = { + {cinn::common::CINNValue(tensor_0)}}; + cinn::common::CINNValuePack rets_1 = GetComputeResult(impl_1, cinn_inputs_1, "out_1"); ir::Expr out_1 = rets_1[0]; @@ -256,18 +258,18 @@ TEST(Operator, Operator_BroadcastTo_0) { auto impl_2 = OpStrategy::SelectImpl( strategy[reduce_sum](attrs, {A.tensor()}, type, {out_shape}, target)); - std::vector cinn_inputs_2 = { - {common::CINNValue(A.tensor())}}; - common::CINNValuePack rets_2 = + std::vector cinn_inputs_2 = { + {cinn::common::CINNValue(A.tensor())}}; + cinn::common::CINNValuePack rets_2 = GetComputeResult(impl_2, cinn_inputs_2, "out_2"); ir::Expr out_2 = rets_2[0]; auto tensor_2 = out_2.as_tensor_ref(); poly::StageMap stages_2 = rets_2.back(); - std::vector cinn_inputs_4 = { - {common::CINNValue(A.tensor())}}; - common::CINNValuePack rets_4 = + std::vector cinn_inputs_4 = { + {cinn::common::CINNValue(A.tensor())}}; + cinn::common::CINNValuePack rets_4 = GetComputeResult(impl_2, cinn_inputs_4, "out_4"); ir::Expr out_4 = rets_4[0]; auto tensor_4 = out_4.as_tensor_ref(); @@ -275,9 +277,9 @@ TEST(Operator, Operator_BroadcastTo_0) { auto impl_3 = OpStrategy::SelectImpl(strategy[elementwise_add]( attrs, {tensor_1, tensor_2}, type, {out_shape}, target)); - std::vector cinn_inputs_3 = { - {common::CINNValue(tensor_1), common::CINNValue(tensor_2)}}; - common::CINNValuePack rets_3 = + std::vector cinn_inputs_3 = { + {cinn::common::CINNValue(tensor_1), cinn::common::CINNValue(tensor_2)}}; + cinn::common::CINNValuePack rets_3 = GetComputeResult(impl_3, cinn_inputs_3, "out_3"); ir::Expr out_3 = rets_3[0]; diff --git a/paddle/cinn/hlir/op/op_nn_test.cc b/paddle/cinn/hlir/op/op_nn_test.cc index 1d4920439ef4f2..593c75da8cfe95 100644 --- a/paddle/cinn/hlir/op/op_nn_test.cc +++ b/paddle/cinn/hlir/op/op_nn_test.cc @@ -43,12 +43,13 @@ Module LowerToModule(const std::string test_name, std::vector input_names, const std::string &output_name, std::vector &inputs, // NOLINT - std::vector cinn_inputs, + std::vector cinn_inputs, const Target &target) { Module::Builder builder("module", target); cinn_inputs.emplace_back(output_name); - common::CINNValuePack cinn_input = common::CINNValuePack{cinn_inputs}; + cinn::common::CINNValuePack cinn_input = + cinn::common::CINNValuePack{cinn_inputs}; input_names.push_back(output_name); auto funcs = framework::GetFuncFromImpl( @@ -80,7 +81,7 @@ TEST(Operator, Operator_Pool2d_Test0) { attrs.attr_store["pool_type"] = pool_type; std::vector inputs{A.tensor()}; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); auto impl = OpStrategy::SelectImpl(strategy[pool2d]( attrs, inputs, type, {{1, 3, 10, 10}, {1, 3, 5, 5}}, target)); @@ -91,7 +92,7 @@ TEST(Operator, Operator_Pool2d_Test0) { {"A"}, "B", inputs, - {common::CINNValue(A)}, + {cinn::common::CINNValue(A)}, target); auto jit = backends::ExecutionEngine::Create({}); @@ -102,11 +103,12 @@ TEST(Operator, Operator_Pool2d_Test0) { auto fn_ = reinterpret_cast(fn); cinn_buffer_t *A_buf = - common::BufferBuilder(Float(32), {1, 3, 8, 
8}).set_random().Build(); - cinn_buffer_t *B_buf = - common::BufferBuilder(Float(32), {1, 3, 10, 10}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {1, 3, 8, 8}).set_random().Build(); + cinn_buffer_t *B_buf = cinn::common::BufferBuilder(Float(32), {1, 3, 10, 10}) + .set_random() + .Build(); cinn_buffer_t *C_buf = - common::BufferBuilder(Float(32), {1, 3, 5, 5}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {1, 3, 5, 5}).set_random().Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf), c_arg(C_buf); cinn_pod_value_t args[] = {a_arg, b_arg, c_arg}; fn_(args, 3); @@ -138,7 +140,7 @@ TEST(Operator, Operator_Pool2d_Test1) { attrs.attr_store["exclusive"] = false; std::vector inputs{A.tensor()}; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); auto impl = OpStrategy::SelectImpl(strategy[pool2d]( attrs, inputs, type, {{1, 3, 11, 11}, {1, 3, 5, 5}}, target)); @@ -150,7 +152,7 @@ TEST(Operator, Operator_Pool2d_Test1) { {"A"}, "B", inputs, - {common::CINNValue(A)}, + {cinn::common::CINNValue(A)}, target); auto jit = backends::ExecutionEngine::Create({}); @@ -161,11 +163,12 @@ TEST(Operator, Operator_Pool2d_Test1) { auto fn_ = reinterpret_cast(fn); cinn_buffer_t *A_buf = - common::BufferBuilder(Float(32), {1, 3, 8, 8}).set_random().Build(); - cinn_buffer_t *B_buf = - common::BufferBuilder(Float(32), {1, 3, 11, 11}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {1, 3, 8, 8}).set_random().Build(); + cinn_buffer_t *B_buf = cinn::common::BufferBuilder(Float(32), {1, 3, 11, 11}) + .set_random() + .Build(); cinn_buffer_t *C_buf = - common::BufferBuilder(Float(32), {1, 3, 5, 5}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {1, 3, 5, 5}).set_random().Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf), c_arg(C_buf); cinn_pod_value_t args[] = {a_arg, b_arg, c_arg}; fn_(args, 3); @@ -199,7 +202,7 @@ TEST(Operator, Operator_Pool2d_Test2) { attrs.attr_store["data_format"] = data_format; std::vector inputs{A.tensor()}; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); auto impl = OpStrategy::SelectImpl(strategy[pool2d]( attrs, inputs, type, {{1, 11, 11, 3}, {1, 5, 5, 3}}, target)); @@ -211,7 +214,7 @@ TEST(Operator, Operator_Pool2d_Test2) { {"A"}, "B", inputs, - {common::CINNValue(A)}, + {cinn::common::CINNValue(A)}, target); auto jit = backends::ExecutionEngine::Create({}); @@ -222,11 +225,12 @@ TEST(Operator, Operator_Pool2d_Test2) { auto fn_ = reinterpret_cast(fn); cinn_buffer_t *A_buf = - common::BufferBuilder(Float(32), {1, 8, 8, 3}).set_random().Build(); - cinn_buffer_t *B_buf = - common::BufferBuilder(Float(32), {1, 11, 11, 3}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {1, 8, 8, 3}).set_random().Build(); + cinn_buffer_t *B_buf = cinn::common::BufferBuilder(Float(32), {1, 11, 11, 3}) + .set_random() + .Build(); cinn_buffer_t *C_buf = - common::BufferBuilder(Float(32), {1, 5, 5, 3}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {1, 5, 5, 3}).set_random().Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf), c_arg(C_buf); cinn_pod_value_t args[] = {a_arg, b_arg, c_arg}; fn_(args, 3); @@ -260,7 +264,7 @@ TEST(Operator, Operator_Pool3d_Test0) { attrs.attr_store["data_format"] = data_format; std::vector inputs{A.tensor()}; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + 
cinn::common::Target target = cinn::common::DefaultHostTarget(); auto impl = OpStrategy::SelectImpl(strategy[pool3d]( attrs, inputs, type, {{1, 11, 11, 11, 3}, {1, 5, 5, 5, 3}}, target)); @@ -271,7 +275,7 @@ TEST(Operator, Operator_Pool3d_Test0) { {"A"}, "B", inputs, - {common::CINNValue(A)}, + {cinn::common::CINNValue(A)}, target); auto jit = backends::ExecutionEngine::Create({}); @@ -281,12 +285,16 @@ TEST(Operator, Operator_Pool3d_Test0) { CHECK(fn); auto fn_ = reinterpret_cast(fn); - cinn_buffer_t *A_buf = - common::BufferBuilder(Float(32), {1, 8, 8, 8, 3}).set_random().Build(); + cinn_buffer_t *A_buf = cinn::common::BufferBuilder(Float(32), {1, 8, 8, 8, 3}) + .set_random() + .Build(); cinn_buffer_t *B_buf = - common::BufferBuilder(Float(32), {1, 11, 11, 11, 3}).set_random().Build(); - cinn_buffer_t *C_buf = - common::BufferBuilder(Float(32), {1, 5, 5, 5, 3}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {1, 11, 11, 11, 3}) + .set_random() + .Build(); + cinn_buffer_t *C_buf = cinn::common::BufferBuilder(Float(32), {1, 5, 5, 5, 3}) + .set_random() + .Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf), c_arg(C_buf); cinn_pod_value_t args[] = {a_arg, b_arg, c_arg}; fn_(args, 3); @@ -320,7 +328,7 @@ TEST(Operator, Operator_Pool1d_Test0) { attrs.attr_store["data_format"] = data_format; std::vector inputs{A.tensor()}; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); auto impl = OpStrategy::SelectImpl( strategy[pool1d](attrs, inputs, type, {{1, 11, 3}, {1, 5, 3}}, target)); @@ -331,7 +339,7 @@ TEST(Operator, Operator_Pool1d_Test0) { {"A"}, "B", inputs, - {common::CINNValue(A)}, + {cinn::common::CINNValue(A)}, target); auto jit = backends::ExecutionEngine::Create({}); @@ -342,11 +350,11 @@ TEST(Operator, Operator_Pool1d_Test0) { auto fn_ = reinterpret_cast(fn); cinn_buffer_t *A_buf = - common::BufferBuilder(Float(32), {1, 8, 3}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {1, 8, 3}).set_random().Build(); cinn_buffer_t *B_buf = - common::BufferBuilder(Float(32), {1, 11, 3}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {1, 11, 3}).set_random().Build(); cinn_buffer_t *C_buf = - common::BufferBuilder(Float(32), {1, 5, 3}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {1, 5, 3}).set_random().Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf), c_arg(C_buf); cinn_pod_value_t args[] = {a_arg, b_arg, c_arg}; fn_(args, 3); @@ -372,7 +380,7 @@ TEST(Operator, Operator_Select_Test0) { std::vector inputs{ condition.tensor(), true_value.tensor(), false_value.tensor()}; std::vector type{Float(32)}; - const common::Target target = common::DefaultHostTarget(); + const cinn::common::Target target = cinn::common::DefaultHostTarget(); const std::vector input_shapes = { {16, 64, 64}, {16, 64, 64}, {16, 64, 64}}; @@ -387,9 +395,10 @@ TEST(Operator, Operator_Select_Test0) { std::string func_name = "select"; std::vector input_names = { "condition", "true_value", "false_value"}; - std::vector cinn_inputs = {common::CINNValue(condition), - common::CINNValue(true_value), - common::CINNValue(false_value)}; + std::vector cinn_inputs = { + cinn::common::CINNValue(condition), + cinn::common::CINNValue(true_value), + cinn::common::CINNValue(false_value)}; auto module = LowerToModule("Operator_Select_Test0", func_name, @@ -408,13 +417,13 @@ TEST(Operator, Operator_Select_Test0) { auto fn_ = reinterpret_cast(fn); cinn_buffer_t *A_buf = - 
common::BufferBuilder(Bool(), {16, 64, 64}).set_random().Build(); + cinn::common::BufferBuilder(Bool(), {16, 64, 64}).set_random().Build(); cinn_buffer_t *B_buf = - common::BufferBuilder(Float(32), {16, 64, 64}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {16, 64, 64}).set_random().Build(); cinn_buffer_t *C_buf = - common::BufferBuilder(Float(32), {16, 64, 64}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {16, 64, 64}).set_random().Build(); cinn_buffer_t *D_buf = - common::BufferBuilder(Float(32), {16, 64, 64}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {16, 64, 64}).set_random().Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf), c_arg(C_buf), d_arg(D_buf); cinn_pod_value_t args[] = {a_arg, b_arg, c_arg, d_arg}; @@ -452,7 +461,7 @@ TEST(Operator, Operator_Reverse_Test0) { attrs.attr_store["axis"] = axis; std::vector inputs{A.tensor()}; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); auto impl = OpStrategy::SelectImpl( strategy[reverse](attrs, inputs, type, {{c, h, w}}, target)); @@ -464,7 +473,7 @@ TEST(Operator, Operator_Reverse_Test0) { {"A"}, "B", inputs, - {common::CINNValue(A)}, + {cinn::common::CINNValue(A)}, target); auto jit = backends::ExecutionEngine::Create({}); @@ -475,9 +484,9 @@ TEST(Operator, Operator_Reverse_Test0) { auto fn_ = reinterpret_cast(fn); cinn_buffer_t *A_buf = - common::BufferBuilder(Float(32), {c, h, w}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {c, h, w}).set_random().Build(); cinn_buffer_t *B_buf = - common::BufferBuilder(Float(32), {c, h, w}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {c, h, w}).set_random().Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf); cinn_pod_value_t args[] = {a_arg, b_arg}; fn_(args, 2); @@ -516,7 +525,7 @@ TEST(Operator, Operator_Transpose_Test0) { attrs.attr_store["axis"] = axis; std::vector inputs{A.tensor()}; std::vector type{Float(32)}; - common::Target target = common::DefaultHostTarget(); + cinn::common::Target target = cinn::common::DefaultHostTarget(); auto infer_shape = infer_shape_func({{n, c, h, w}}, attrs.attr_store); ASSERT_EQ(infer_shape[0][0], n); @@ -551,7 +560,7 @@ TEST(Operator, Operator_Transpose_Test0) { {"A"}, "B", inputs, - {common::CINNValue(A)}, + {cinn::common::CINNValue(A)}, target); auto jit = backends::ExecutionEngine::Create({}); @@ -562,9 +571,9 @@ TEST(Operator, Operator_Transpose_Test0) { auto fn_ = reinterpret_cast(fn); cinn_buffer_t *A_buf = - common::BufferBuilder(Float(32), input_shape).set_random().Build(); + cinn::common::BufferBuilder(Float(32), input_shape).set_random().Build(); cinn_buffer_t *B_buf = - common::BufferBuilder(Float(32), output_shape).set_random().Build(); + cinn::common::BufferBuilder(Float(32), output_shape).set_random().Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf); cinn_pod_value_t args[] = {a_arg, b_arg}; fn_(args, 2); diff --git a/paddle/cinn/hlir/op/op_util.cc b/paddle/cinn/hlir/op/op_util.cc index 4a8ec32633dbd1..6cad9f4cb75f12 100644 --- a/paddle/cinn/hlir/op/op_util.cc +++ b/paddle/cinn/hlir/op/op_util.cc @@ -31,7 +31,7 @@ CINNSchedule GetElementwiseScheduleFunc( return CINNSchedule([=](lang::Args args, lang::RetValue* ret) { CHECK(!args.empty()) << "The input argument of ElementwiseSchedule is " "empty! 
Please check.\n"; - common::CINNValuePack arg_pack = args[0]; + cinn::common::CINNValuePack arg_pack = args[0]; CHECK_GT(arg_pack.size(), 0U) << "arg_pack.size() must contains at least one element."; std::vector vec_ast; @@ -46,9 +46,9 @@ CINNSchedule GetElementwiseScheduleFunc( ir::IRSchedule ir_sch(mod_expr); ir_sch.MergeExprs(); pe::IRElementwiseSchedule(ir_sch, output_shapes.front(), target); - std::vector res{ - common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; + std::vector res{ + cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = cinn::common::CINNValuePack{res}; }); } @@ -59,7 +59,7 @@ CINNSchedule GetInjectiveScheduleFunc( return CINNSchedule([=](lang::Args args, lang::RetValue* ret) { CHECK(!args.empty()) << "The input argument of InjectiveSchedule is " "empty! Please check.\n"; - common::CINNValuePack arg_pack = args[0]; + cinn::common::CINNValuePack arg_pack = args[0]; std::vector vec_ast; for (int i = 0; i < arg_pack.size(); i++) { if (arg_pack[i].is_expr()) { @@ -78,14 +78,14 @@ CINNSchedule GetInjectiveScheduleFunc( pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, vectorizable); }*/ - std::vector res{ - common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; - *ret = common::CINNValuePack{res}; + std::vector res{ + cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))}; + *ret = cinn::common::CINNValuePack{res}; }); } -std::string GetExternFuncName(const common::Target& target, - const common::Type& type, +std::string GetExternFuncName(const cinn::common::Target& target, + const cinn::common::Type& type, const std::string& func_name, const bool need_cinn, const bool need_target, @@ -95,9 +95,9 @@ std::string GetExternFuncName(const common::Target& target, func_proto_name.append("cinn_"); } if (need_target) { - if (target.arch == common::Target::Arch::NVGPU) { + if (target.arch == cinn::common::Target::Arch::NVGPU) { func_proto_name.append("nvgpu_"); - } else if (target.arch == common::Target::Arch::X86) { + } else if (target.arch == cinn::common::Target::Arch::X86) { func_proto_name.append("host_"); } else { LOG(FATAL) << func_name diff --git a/paddle/cinn/hlir/op/op_util.h b/paddle/cinn/hlir/op/op_util.h index 082c1f258a0427..a0521e26f1b724 100644 --- a/paddle/cinn/hlir/op/op_util.h +++ b/paddle/cinn/hlir/op/op_util.h @@ -67,8 +67,9 @@ std::vector ToPodVector(const std::vector &args) { } const auto &type = args.front().type(); - CHECK_EQ(type, common::type_of()) << "Cannot get " << common::type_of() - << " value from " << type << " vector!"; + CHECK_EQ(type, cinn::common::type_of()) + << "Cannot get " << cinn::common::type_of() << " value from " << type + << " vector!"; std::vector shape_v; if (type.is_bool()) { @@ -141,8 +142,8 @@ CINNSchedule GetInjectiveScheduleFunc( const Target &target, bool vectorizable = true); -std::string GetExternFuncName(const common::Target &target, - const common::Type &type, +std::string GetExternFuncName(const cinn::common::Target &target, + const cinn::common::Type &type, const std::string &func_name, const bool need_cinn = true, const bool need_target = true, diff --git a/paddle/cinn/hlir/op/reduction.cc b/paddle/cinn/hlir/op/reduction.cc index 893c0c41fd7070..f9a019a9eea5c8 100644 --- a/paddle/cinn/hlir/op/reduction.cc +++ b/paddle/cinn/hlir/op/reduction.cc @@ -38,9 +38,9 @@ PD_DECLARE_bool(cinn_new_group_scheduler); namespace cinn { namespace hlir { namespace op { -using common::_CINNValuePack_; -using common::CINNValue; -using common::CINNValuePack; 
+using cinn::common::_CINNValuePack_; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::OpStrategy; using framework::shape_t; using framework::StrategyFunction; @@ -142,7 +142,7 @@ std::shared_ptr StrategyForReduce( *ret = CINNValuePack{cinn_values}; }; if (!FLAGS_cinn_enable_map_expr && !FLAGS_cinn_new_group_scheduler && - target == common::DefaultNVGPUTarget()) { + target == cinn::common::DefaultNVGPUTarget()) { if (!WithoutLastDimInReduce(inputs[0]->shape, reduce_axes)) { VLOG(3) << "Do Two Step Block Reduce Compute!"; auto res = gpu_reduce_with_last_axis_func( @@ -246,7 +246,7 @@ std::shared_ptr StrategyForReduce( reduce_tmp_out.as_tensor_ref(), tmp_out.as_tensor_ref(), out.as_tensor_ref(), - common::DefaultNVGPUTarget()); + cinn::common::DefaultNVGPUTarget()); std::vector res{ CINNValue(ir_sch.GetModule().GetExprs().at(0))}; @@ -262,7 +262,7 @@ std::shared_ptr StrategyForReduce( reduce_tmp_out.as_tensor_ref(), tmp_out.as_tensor_ref(), out.as_tensor_ref(), - common::DefaultNVGPUTarget()); + cinn::common::DefaultNVGPUTarget()); std::vector res{ CINNValue(ir_sch.GetModule().GetExprs().at(0))}; diff --git a/paddle/cinn/hlir/op/reduction_test.cc b/paddle/cinn/hlir/op/reduction_test.cc index 953dd82017d9bd..5586c323462ac6 100644 --- a/paddle/cinn/hlir/op/reduction_test.cc +++ b/paddle/cinn/hlir/op/reduction_test.cc @@ -46,9 +46,9 @@ namespace cinn { namespace hlir { namespace framework { -using common::_CINNValuePack_; -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::_CINNValuePack_; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::OpStrategy; using framework::shape_t; using framework::StrategyFunction; @@ -91,7 +91,7 @@ std::pair GenReduceCode( } } - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); auto impl = OpStrategy::SelectImpl( strategy(attrs, inputs, out_type, {output_shape}, target)); @@ -99,7 +99,8 @@ std::pair GenReduceCode( std::vector input_output_nodes{"X", op_name}; func = GetFuncFromImpl( impl, - common::CINNValuePack{{common::CINNValue(X), common::CINNValue(op_name)}}, + cinn::common::CINNValuePack{ + {cinn::common::CINNValue(X), cinn::common::CINNValue(op_name)}}, inputs, input_output_nodes, func_name, @@ -353,8 +354,9 @@ void TestCaseForReduce(const float init_val, // auto func_0 = reinterpret_cast(fn_reduce_sum); auto buffer_x = - common::BufferBuilder(Float(32), {n, c, h, w}).set_random().Build(); - auto buffer_z = common::BufferBuilder(Float(32), {c}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {n, c, h, w}).set_random().Build(); + auto buffer_z = + cinn::common::BufferBuilder(Float(32), {c}).set_random().Build(); void *dev_x = nullptr, *dev_z = nullptr; CUDA_CALL(cudaMalloc(&dev_x, buffer_x->memory_size)); @@ -455,8 +457,9 @@ TEST(Operator, Operator_Reduction_Case_7) { srand(time(NULL)); auto buffer_x = - common::BufferBuilder(Float(32), {n, c, h, w}).set_random().Build(); - auto buffer_y = common::BufferBuilder(Float(32), {h, w}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {n, c, h, w}).set_random().Build(); + auto buffer_y = + cinn::common::BufferBuilder(Float(32), {h, w}).set_random().Build(); void *dev_x = nullptr, *dev_y = nullptr; CUDA_CALL(cudaMalloc(&dev_x, buffer_x->memory_size)); @@ -528,9 +531,9 @@ TEST(Operator, Operator_Reduction_Case_11) { } TEST(Operator, Operator_Reduction_Case_Warp_Reduce) { - int sm_count = common::DefaultNVGPUTarget().get_multi_processor_count(); + int 
sm_count = cinn::common::DefaultNVGPUTarget().get_multi_processor_count(); int max_threads_per_sm = - common::DefaultNVGPUTarget().get_max_threads_per_sm(); + cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm(); int warp_reduce_threshold = sm_count * max_threads_per_sm / 32; std::vector shape = {warp_reduce_threshold + 10, 256}; @@ -542,9 +545,9 @@ TEST(Operator, Operator_Reduction_Case_Warp_Reduce) { } TEST(Operator, Operator_Reduction_Case_Block_Reduce) { - int sm_count = common::DefaultNVGPUTarget().get_multi_processor_count(); + int sm_count = cinn::common::DefaultNVGPUTarget().get_multi_processor_count(); int max_threads_per_sm = - common::DefaultNVGPUTarget().get_max_threads_per_sm(); + cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm(); int warp_reduce_threshold = sm_count * max_threads_per_sm / 32; std::vector shape = {warp_reduce_threshold - 10, 33}; @@ -556,9 +559,9 @@ TEST(Operator, Operator_Reduction_Case_Block_Reduce) { } TEST(Operator, Operator_Reduction_Case_Warp_Reduce_Case_1) { - int sm_count = common::DefaultNVGPUTarget().get_multi_processor_count(); + int sm_count = cinn::common::DefaultNVGPUTarget().get_multi_processor_count(); int max_threads_per_sm = - common::DefaultNVGPUTarget().get_max_threads_per_sm(); + cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm(); int warp_reduce_threshold = sm_count * max_threads_per_sm / 32; std::vector shape = {(warp_reduce_threshold + 32) / 2, 2, 10, 256}; @@ -571,9 +574,9 @@ TEST(Operator, Operator_Reduction_Case_Warp_Reduce_Case_1) { } TEST(Operator, Operator_Reduction_Case_Block_Reduce_Case_1) { - int sm_count = common::DefaultNVGPUTarget().get_multi_processor_count(); + int sm_count = cinn::common::DefaultNVGPUTarget().get_multi_processor_count(); int max_threads_per_sm = - common::DefaultNVGPUTarget().get_max_threads_per_sm(); + cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm(); int warp_reduce_threshold = sm_count * max_threads_per_sm / 32; std::vector shape = {(warp_reduce_threshold - 32) / 2, 2, 10, 33}; diff --git a/paddle/cinn/hlir/op/transform.cc b/paddle/cinn/hlir/op/transform.cc index ed22f50c644b09..ce1e29731a974f 100644 --- a/paddle/cinn/hlir/op/transform.cc +++ b/paddle/cinn/hlir/op/transform.cc @@ -31,9 +31,9 @@ namespace cinn { namespace hlir { namespace op { -using common::_CINNValuePack_; -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::_CINNValuePack_; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::OpStrategy; using framework::shape_t; using framework::StrategyFunction; diff --git a/paddle/cinn/hlir/op/transform_test.cc b/paddle/cinn/hlir/op/transform_test.cc index 2c2612bd1865b5..0e9b6d86d2ece6 100644 --- a/paddle/cinn/hlir/op/transform_test.cc +++ b/paddle/cinn/hlir/op/transform_test.cc @@ -44,9 +44,9 @@ namespace cinn { namespace hlir { namespace framework { -using common::_CINNValuePack_; -using common::CINNValue; -using common::CINNValuePack; +using cinn::common::_CINNValuePack_; +using cinn::common::CINNValue; +using cinn::common::CINNValuePack; using framework::OpStrategy; using framework::shape_t; using framework::StrategyFunction; @@ -75,9 +75,9 @@ TEST(SliceAssign, SliceAssign_Op) { std::vector inputs{input.tensor(), assign.tensor()}; #ifdef CINN_WITH_CUDA - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); #else - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); #endif auto impl = OpStrategy::SelectImpl( 
strategy(attrs, inputs, out_type, {output_shape}, target)); @@ -85,10 +85,10 @@ TEST(SliceAssign, SliceAssign_Op) { std::string func_name = "slice_assign"; std::string out_name = "output"; - common::CINNValuePack cinn_input = - common::CINNValuePack{{common::CINNValue(input.tensor()), - common::CINNValue(assign.tensor()), - common::CINNValue(out_name)}}; + cinn::common::CINNValuePack cinn_input = + cinn::common::CINNValuePack{{cinn::common::CINNValue(input.tensor()), + cinn::common::CINNValue(assign.tensor()), + cinn::common::CINNValue(out_name)}}; std::vector input_output_names{"input", "assign", out_name}; auto funcs = framework::GetFuncFromImpl( diff --git a/paddle/cinn/hlir/pass/alterlayout.cc b/paddle/cinn/hlir/pass/alterlayout.cc index 3c8d775fc9befe..4e7df28e7994a4 100644 --- a/paddle/cinn/hlir/pass/alterlayout.cc +++ b/paddle/cinn/hlir/pass/alterlayout.cc @@ -25,8 +25,8 @@ namespace cinn { namespace hlir { namespace pass { -using common::GraphNode; -using common::Type; +using cinn::common::GraphNode; +using cinn::common::Type; using framework::Graph; using framework::Node; using framework::NodeData; @@ -92,7 +92,7 @@ std::vector UpdateInferInfos( const std::vector& input_shapes, const std::vector& input_types, const std::vector& input_layouts, - const common::Target& target, + const cinn::common::Target& target, const OpValueType& op_infershape, const OpValueType& op_infertype, const OpValueType& op_inferlayout, @@ -219,13 +219,13 @@ void AlterLayoutPass(Graph* graph) { // alter conv2d op to conv2d_NCHWc Node* new_node = new Node(Operator::Get(new_op_type), new_op_type, - common::UniqName(new_op_type)); + cinn::common::UniqName(new_op_type)); new_node->attrs.attr_store = node->attrs.attr_store; std::string new_data_format = "NCHWc"; new_node->attrs.attr_store["data_format"] = new_data_format; const auto& conv_inlinks = node->inlinks_in_order(); - std::vector input_nodes; + std::vector input_nodes; for (auto& link : conv_inlinks) { auto* source = link->source(); input_nodes.push_back(source); @@ -322,8 +322,8 @@ void AlterLayoutPass(Graph* graph) { 0, src_input_layout, dst_input_layout, - common::UniqName(node->op()->name + - "_input_layout_tranform")); + cinn::common::UniqName(node->op()->name + + "_input_layout_tranform")); UpdateInferInfos(input_trans_node, {input_shape}, {input_type}, @@ -370,8 +370,8 @@ void AlterLayoutPass(Graph* graph) { 1, src_kernel_layout, dst_kernel_layout, - common::UniqName(node->op()->name + - "_weight_layout_tranform")); + cinn::common::UniqName(node->op()->name + + "_weight_layout_tranform")); UpdateInferInfos(weight_trans_node, {weight_shape}, {weight_type}, @@ -427,13 +427,14 @@ void AlterLayoutPass(Graph* graph) { count++; } for (int i = 1; i < infershapes.size(); i++) { - auto* new_out = new NodeData( - node_ptr, - i, - 0, - common::UniqName(new_node->id() + "_out_" + std::to_string(i))); + auto* new_out = + new NodeData(node_ptr, + i, + 0, + cinn::common::UniqName(new_node->id() + "_out_" + + std::to_string(i))); graph->RegisterNode(new_out->id(), new_out); - new_node->as()->LinkTo(new_out); + new_node->as()->LinkTo(new_out); } graph->RegisterNode(new_node->id(), new_node); // update conv2d_NCHWc's infershape, infertype, inferlayout and set @@ -513,10 +514,10 @@ void AlterLayoutPass(Graph* graph) { CHECK(input_data); VLOG(3) << source->id() << " do layout_tranform from C to NCHW"; std::string op_type = "broadcast_to"; - auto trans_node = - new Node(Operator::Get(op_type), - op_type, - common::UniqName(source->id() + "_broadcastto")); + auto 
trans_node = new Node( + Operator::Get(op_type), + op_type, + cinn::common::UniqName(source->id() + "_broadcastto")); trans_node->attrs.attr_store["out_shape"] = new_shapes; std::vector broadcast_axes = {1}; trans_node->attrs.attr_store["broadcast_axes"] = broadcast_axes; @@ -551,8 +552,8 @@ void AlterLayoutPass(Graph* graph) { i, new_src_layout, new_input_layouts[i], - common::UniqName(new_input_data->id() + - "_layout_tranform")); + cinn::common::UniqName(new_input_data->id() + + "_layout_tranform")); UpdateInferInfos(new_trans_node, {shape_dict[new_input_data->id()]}, {input_types[i]}, @@ -585,7 +586,8 @@ void AlterLayoutPass(Graph* graph) { i, src_layout, new_input_layouts[i], - common::UniqName(source->id() + "_layout_tranform")); + cinn::common::UniqName(source->id() + + "_layout_tranform")); UpdateInferInfos(trans_node, {input_shapes[i]}, {input_types[i]}, @@ -618,7 +620,8 @@ void AlterLayoutPass(Graph* graph) { i, src_layout, new_input_layouts[i], - common::UniqName(source->id() + "_layout_tranform")); + cinn::common::UniqName(source->id() + + "_layout_tranform")); UpdateInferInfos(trans_node, {input_shapes[i]}, {input_types[i]}, @@ -705,7 +708,8 @@ void AlterLayoutPass(Graph* graph) { 0, src_layout, dst_layout, - common::UniqName(node->op()->name + "_final_layout_tranform")); + cinn::common::UniqName(node->op()->name + + "_final_layout_tranform")); shape_dict[temp_out->id()] = shape; type_dict[temp_out->id()] = type; layout_dict[temp_out->id()] = src_layout; diff --git a/paddle/cinn/hlir/pass/alterlayout_test.cc b/paddle/cinn/hlir/pass/alterlayout_test.cc index 293c0c07ebdde6..0936513b5e7584 100644 --- a/paddle/cinn/hlir/pass/alterlayout_test.cc +++ b/paddle/cinn/hlir/pass/alterlayout_test.cc @@ -65,7 +65,7 @@ TEST(conv, conv) { auto c = program.conv2d(A, B, attrs); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); program.SetInputs({A, B}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -112,7 +112,7 @@ TEST(conv_relu_conv, conv_relu_conv) { auto d = program.relu(c); auto e = program.conv2d(d, D, attrs); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); program.SetInputs({A, B, D}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -162,7 +162,7 @@ TEST(conv_add_conv, conv_add_conv) { auto d = program.elementwise_add(c, C, 1); auto e = program.conv2d(d, D, attrs); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); program.SetInputs({A, B, D}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -219,7 +219,7 @@ TEST(conv_bn_conv, conv_bn_conv) { auto d = program.batchnorm(c, Scale, Bias, Mean, Variance, attrs1); auto e = program.conv2d(d, D, attrs); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); program.SetInputs({A, B, D}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -276,7 +276,7 @@ TEST(conv_pool2d_conv, conv_pool2d_conv) { auto d = program.pool2d(c, attrs2); auto e = program.conv2d(d, D, attrs); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); program.SetInputs({A, B, D}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -328,7 +328,7 @@ TEST(conv_softmax_conv, conv_softmax_conv) { auto d = program.softmax(c, attrs1); auto e = program.conv2d(d, D, attrs); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); 
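// The mechanical common:: -> cinn::common:: rewrite running through this
// patch is easiest to motivate in isolation. A minimal sketch, assuming only
// that the series introduces a top-level ::common namespace (the new home of
// DDim and friends) next to the existing cinn::common; the types below are
// illustrative stand-ins, not the real Paddle headers:
namespace common { struct Type {}; }                       // new top-level home
namespace cinn { namespace common { struct Type {}; } }   // CINN's own common

using namespace cinn;  // common in CINN translation units

int main() {
  // common::Type t;       // would not compile: 'common' is ambiguous here
  cinn::common::Type a;    // always unambiguous -- the form this patch adopts
  ::common::Type b;        // explicit root qualification for the new namespace
  (void)a;
  (void)b;
  return 0;
}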
program.SetInputs({A, B, D}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -377,7 +377,7 @@ TEST(conv_sigmoid_conv, conv_sigmoid_conv) { auto d = program.sigmoid(c); auto e = program.conv2d(d, D, attrs); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); program.SetInputs({A, B, D}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -430,7 +430,7 @@ TEST(conv_mul_conv, conv_mul_conv) { auto d = program.mul(c, C, 1, 1); auto e = program.softmax(d, attrs1); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); program.SetInputs({A, B, D}); program.Validate(); LOG(INFO) << "Program:\n" << program; diff --git a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc index 52a5d128860d21..c0b4f6592bc27d 100644 --- a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc +++ b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass.cc @@ -37,15 +37,15 @@ using framework::NodePtr; using framework::Operator; using framework::OpPatternKind; +using cinn::common::GraphEdge; +using cinn::common::GraphNode; using cinn::hlir::framework::GenerateAccCheckNodeId; -using common::GraphEdge; -using common::GraphNode; using GroupPtr = std::shared_ptr; using GroupList = std::vector; using ShapeDict = absl::flat_hash_map; -using DtypeDict = absl::flat_hash_map; +using DtypeDict = absl::flat_hash_map; namespace utils { class AssertMsg { @@ -325,7 +325,7 @@ std::pair CheckFusionAccuracyPass::CreateIsCloseNode( auto check_out_shape = shape_dict_.at(node_id); shape_dict_.emplace(output_data->id(), std::move(check_out_shape)); - dtype_dict_.emplace(output_data->id(), common::Bool()); + dtype_dict_.emplace(output_data->id(), cinn::common::Bool()); VLOG(4) << "Create node " << node_id << "'s isclose node success, whose id is " << is_close_node_id @@ -356,7 +356,7 @@ std::pair CheckFusionAccuracyPass::CreateAllNode( auto output_data = CreateOutputNode(all_node); shape_dict_.emplace(output_data->id(), framework::shape_t{1}); - dtype_dict_.emplace(output_data->id(), common::Bool()); + dtype_dict_.emplace(output_data->id(), cinn::common::Bool()); VLOG(4) << "Create node " << node_id << "'s all node success, whose id is " << all_node_id << ", whose output is " << DebugNodeData(output_data); @@ -387,7 +387,7 @@ std::pair CheckFusionAccuracyPass::CreateAssertNode( auto output_data = CreateOutputNode(assert_node); shape_dict_.emplace(output_data->id(), framework::shape_t{1}); - dtype_dict_.emplace(output_data->id(), common::Bool()); + dtype_dict_.emplace(output_data->id(), cinn::common::Bool()); VLOG(4) << "Create node " << node_id << "'s assert node success, whose id is " << assert_node_id << ", whose output is " diff --git a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc index d523fbb6df9f63..10f5c83e6600d9 100644 --- a/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc +++ b/paddle/cinn/hlir/pass/check_fusion_accuracy_pass_test.cc @@ -80,7 +80,7 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); hlir::framework::ApplyPasses(graph.get(), @@ -117,7 +117,7 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_1) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = 
cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -158,7 +158,7 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_2) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -199,7 +199,7 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_3) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -240,7 +240,7 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_4) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -274,7 +274,7 @@ TEST(CheckFusionAccuracyPass, ElementWise_Fusion_5) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -311,7 +311,7 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_0) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -348,7 +348,7 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_2) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -387,7 +387,7 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_4) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -426,7 +426,7 @@ TEST(CheckFusionAccuracyPass, Broadcast_Test_5) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -462,7 +462,7 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_0) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -497,7 +497,7 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_1) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -535,7 +535,7 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_2) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -573,7 +573,7 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_3) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -612,7 +612,7 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_4) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); @@ -648,7 +648,7 @@ TEST(CheckFusionAccuracyPass, Reduce_Test_5) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); diff --git 
a/paddle/cinn/hlir/pass/common_subexpression_elimination.cc b/paddle/cinn/hlir/pass/common_subexpression_elimination.cc index e595783c7b11b8..d50697583db08c 100644 --- a/paddle/cinn/hlir/pass/common_subexpression_elimination.cc +++ b/paddle/cinn/hlir/pass/common_subexpression_elimination.cc @@ -31,8 +31,8 @@ using framework::Graph; using framework::Node; using framework::NodeData; -using common::GraphEdge; -using common::GraphNode; +using cinn::common::GraphEdge; +using cinn::common::GraphNode; using InputToNodeMap = std::unordered_map>; @@ -99,7 +99,7 @@ bool IsSameSubexpression(Node* op1, bool op1_equal_op2 = std::any_of( op2_in_edges.begin(), op2_in_edges.end(), - [&](common::Shared& edge) { + [&](cinn::common::Shared& edge) { auto* op2_source_node = edge->source()->safe_as(); CHECK(op2_source_node); if (op1_source_node->id() == op2_source_node->id()) { diff --git a/paddle/cinn/hlir/pass/common_subexpression_elimination_test.cc b/paddle/cinn/hlir/pass/common_subexpression_elimination_test.cc index 3e7417a9c29aa2..4f326ee58f1c96 100644 --- a/paddle/cinn/hlir/pass/common_subexpression_elimination_test.cc +++ b/paddle/cinn/hlir/pass/common_subexpression_elimination_test.cc @@ -59,7 +59,7 @@ TEST(common_subexpression_elimination, common_subexpression_elimination_case1) { auto concat = program.concat({t_1, t_2, t_3}); auto max = program.reduce_max(concat, {0}, true); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -104,7 +104,7 @@ TEST(common_subexpression_elimination, common_subexpression_elimination_case2) { auto concat_2 = program.concat({reshape_1, reshape_2}); auto concat_3 = program.concat({reshape_1, reshape_2}, 1); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -168,7 +168,7 @@ TEST(common_subexpression_elimination, common_subexpression_elimination_case3) { fetch_list.insert(out1->id); fetch_list.insert(out2->id); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); auto graph = std::make_shared(program, fetch_list, target); LOG(INFO) << "graph:\n" << graph->DebugGroupedGraph(fetch_list); diff --git a/paddle/cinn/hlir/pass/const_propagate.cc b/paddle/cinn/hlir/pass/const_propagate.cc index 3db1c174222943..0654cc85520eb6 100644 --- a/paddle/cinn/hlir/pass/const_propagate.cc +++ b/paddle/cinn/hlir/pass/const_propagate.cc @@ -24,7 +24,7 @@ namespace cinn { namespace hlir { namespace pass { -using common::Type; +using cinn::common::Type; using framework::Graph; using framework::Node; using framework::NodeData; diff --git a/paddle/cinn/hlir/pass/const_propagate_test.cc b/paddle/cinn/hlir/pass/const_propagate_test.cc index c1600c81aa5f9d..bf9c2f471e5402 100644 --- a/paddle/cinn/hlir/pass/const_propagate_test.cc +++ b/paddle/cinn/hlir/pass/const_propagate_test.cc @@ -46,7 +46,7 @@ TEST(const_conv, const_conv) { attrs["data_format"] = src_layout; auto c = program.conv2d(A, B, attrs); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -91,7 +91,7 @@ TEST(const_bn, const_bn) { auto a = program.fused_batchnorm_inference(A, Scale, Bias, Mean, Variance, attrs); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); 
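// For readers skimming the IsSameSubexpression hunk above: the test the pass
// applies is purely structural. A self-contained toy model (these types are
// invented stand-ins for illustration, not the real framework::Node API):
#include <algorithm>
#include <string>
#include <vector>

struct ToyNode {
  std::string op;                    // operator name, e.g. "concat"
  std::string attrs;                 // flattened attribute store
  std::vector<const ToyNode*> srcs;  // input producers, in order
};

// Two ops form a common subexpression when they run the same operator with
// the same attributes over exactly the same producers, in the same order.
bool IsSameSubexpression(const ToyNode& a, const ToyNode& b) {
  return a.op == b.op && a.attrs == b.attrs &&
         std::equal(a.srcs.begin(), a.srcs.end(),
                    b.srcs.begin(), b.srcs.end());
}
// When the condition holds, a CSE pass can reroute the second op's consumers
// to the first op's outputs and drop the duplicate.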
program.SetInputs({A, Scale, Bias, Mean, Variance}); program.Validate(); LOG(INFO) << "Program:\n" << program; diff --git a/paddle/cinn/hlir/pass/constant_folding_pass.cc b/paddle/cinn/hlir/pass/constant_folding_pass.cc index 50a76f54cb312e..0f4493ccf3e776 100644 --- a/paddle/cinn/hlir/pass/constant_folding_pass.cc +++ b/paddle/cinn/hlir/pass/constant_folding_pass.cc @@ -25,8 +25,8 @@ using framework::NodeData; using framework::OpPatternKind; using framework::shape_t; -using common::GraphEdge; -using common::GraphNode; +using cinn::common::GraphEdge; +using cinn::common::GraphNode; using AlterFunction = std::function; diff --git a/paddle/cinn/hlir/pass/constant_folding_pass_test.cc b/paddle/cinn/hlir/pass/constant_folding_pass_test.cc index 5e98a0e2bbcfe9..0cf95ea0a12e55 100644 --- a/paddle/cinn/hlir/pass/constant_folding_pass_test.cc +++ b/paddle/cinn/hlir/pass/constant_folding_pass_test.cc @@ -40,7 +40,7 @@ std::unordered_map> RunModelTest( const std::vector&& passes, const std::unordered_map>& input_data, const std::unordered_set& fetch_ids) { - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, fetch_ids, target); hlir::framework::ApplyPasses(graph.get(), passes); diff --git a/paddle/cinn/hlir/pass/constant_folding_pass_util.cc b/paddle/cinn/hlir/pass/constant_folding_pass_util.cc index 90aab2144065fb..748948f2206fcc 100644 --- a/paddle/cinn/hlir/pass/constant_folding_pass_util.cc +++ b/paddle/cinn/hlir/pass/constant_folding_pass_util.cc @@ -65,8 +65,8 @@ class ConstantFoldingHelper { private: Node* CreateNewNode(const std::string& op_name, const AttributeMap& attrs_map) { - auto* node = - new Node(Operator::Get(op_name), op_name, common::UniqName(op_name)); + auto* node = new Node( + Operator::Get(op_name), op_name, cinn::common::UniqName(op_name)); node->attrs.attr_store = attrs_map; graph_->RegisterNode(node->id(), node); return node; diff --git a/paddle/cinn/hlir/pass/custom_call_pass.cc b/paddle/cinn/hlir/pass/custom_call_pass.cc index 287bda3ba783c3..231d2cc7d44200 100644 --- a/paddle/cinn/hlir/pass/custom_call_pass.cc +++ b/paddle/cinn/hlir/pass/custom_call_pass.cc @@ -38,10 +38,10 @@ class GraphAlterHelper { deny_ops_ = {splited_names.begin(), splited_names.end()}; } } - void TransToCustomCall(const common::Target& target) { + void TransToCustomCall(const cinn::common::Target& target) { // collect candidate nodes auto mark_nodes = graph_->CollectNodes( - [this, &target](const common::GraphNode* graph_node) -> bool { + [this, &target](const cinn::common::GraphNode* graph_node) -> bool { if (graph_node->safe_as()) { auto node = graph_node->safe_as(); auto&& op_name = node->op()->name; @@ -63,7 +63,7 @@ class GraphAlterHelper { // codegen-registered is not consistent with cudnn if ((node->op()->name == "conv2d" || node->op()->name == "depthwise_conv2d") && - target == common::DefaultNVGPUTarget()) { + target == cinn::common::DefaultNVGPUTarget()) { auto out_links = node->outlinks_in_order(); for (int idx = 1; idx < out_links.size(); ++idx) { auto link = out_links[idx]; diff --git a/paddle/cinn/hlir/pass/dce_pass.cc b/paddle/cinn/hlir/pass/dce_pass.cc index fd439c1e97cfc7..b17f8ee4de5d9f 100644 --- a/paddle/cinn/hlir/pass/dce_pass.cc +++ b/paddle/cinn/hlir/pass/dce_pass.cc @@ -27,8 +27,8 @@ using framework::NodeData; using framework::OpPatternKind; using framework::shape_t; -using common::GraphEdge; -using common::GraphNode; +using cinn::common::GraphEdge; +using cinn::common::GraphNode; using GroupPtr = 
std::shared_ptr; using GroupList = std::vector; diff --git a/paddle/cinn/hlir/pass/dce_pass_test.cc b/paddle/cinn/hlir/pass/dce_pass_test.cc index 7f5c3355b00673..bb9c5d7654851f 100644 --- a/paddle/cinn/hlir/pass/dce_pass_test.cc +++ b/paddle/cinn/hlir/pass/dce_pass_test.cc @@ -30,7 +30,7 @@ TEST(DCE, Test_0) { auto fetch_ids = {D->id}; auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, fetch_ids, target); @@ -54,7 +54,7 @@ TEST(DCE, Test_1) { auto fetch_ids = {F->id}; auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, fetch_ids, target); diff --git a/paddle/cinn/hlir/pass/dense_merge_pass.cc b/paddle/cinn/hlir/pass/dense_merge_pass.cc index c8433f3a85fc7f..82341cb8469bf3 100644 --- a/paddle/cinn/hlir/pass/dense_merge_pass.cc +++ b/paddle/cinn/hlir/pass/dense_merge_pass.cc @@ -20,7 +20,7 @@ namespace cinn { namespace hlir { namespace pass { -using common::GraphNode; +using cinn::common::GraphNode; using framework::Graph; using framework::Node; using framework::NodeAttr; @@ -118,7 +118,7 @@ class DenseMergePassHelper : public FusionHelperBase { // create custom call node Node* node_tmp = new Node(Operator::Get("custom_call"), "custom_call", - common::UniqName("custom_call")); + cinn::common::UniqName("custom_call")); graph_->RegisterNode(node_tmp->id(), node_tmp); node_tmp->attrs.attr_store = dense_op.second[0]->attrs.attr_store; node_tmp->attrs.attr_store["side"] = side; diff --git a/paddle/cinn/hlir/pass/dense_merge_pass_test.cc b/paddle/cinn/hlir/pass/dense_merge_pass_test.cc index 23ce990ff6fd8b..b1eab6fadfe342 100644 --- a/paddle/cinn/hlir/pass/dense_merge_pass_test.cc +++ b/paddle/cinn/hlir/pass/dense_merge_pass_test.cc @@ -34,7 +34,7 @@ void RunModelTest(Program& program, // NOLINT &inputs_data.back(), inputs_data.back().size(), 0.0f, 1.0f, 1e-3); } - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::unordered_map, std::vector>> outputs; diff --git a/paddle/cinn/hlir/pass/dot_merger.cc b/paddle/cinn/hlir/pass/dot_merger.cc index 8638200180f66e..941cf6b29b66c9 100644 --- a/paddle/cinn/hlir/pass/dot_merger.cc +++ b/paddle/cinn/hlir/pass/dot_merger.cc @@ -22,7 +22,7 @@ namespace hlir { namespace pass { namespace { -using common::GraphNode; +using cinn::common::GraphNode; using framework::Node; using framework::NodeData; using framework::Operator; @@ -33,7 +33,7 @@ using infershape_t = std::function( const std::vector&, const framework::AttrMapType&)>; using inferdtype_t = std::function( const std::vector&, const framework::AttrMapType&)>; -using dtype_dict_t = absl::flat_hash_map; +using dtype_dict_t = absl::flat_hash_map; using shape_dict_t = absl::flat_hash_map; bool accessible(GraphNode* start, GraphNode* end) { @@ -130,7 +130,7 @@ class DotBuilder { const shape_dict_t& shape_dict() const { return shape_dict_; } // Currently the constructor of `NodeData` needs to pass in `Shared`. 
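// As the comment above says, NodeData's constructor takes its producer as a
// Shared handle. cinn::common::Shared is an intrusive reference count; a
// stripped-down sketch of the idea (an assumed shape, not the real
// cinn/common/shared.h) shows why a freshly allocated Node can be adopted
// directly, exactly as DotBuilder does below:
#include <atomic>

struct RefCounted {
  std::atomic<int> refs{0};  // the count lives inside the object (intrusive)
};

template <typename T>
class Shared {
 public:
  explicit Shared(T* p) : p_(p) { if (p_) ++p_->refs; }
  Shared(const Shared& o) : p_(o.p_) { if (p_) ++p_->refs; }
  Shared& operator=(const Shared&) = delete;  // kept minimal for the sketch
  ~Shared() {
    if (p_ && --p_->refs == 0) delete p_;
  }
  T* operator->() const { return p_; }
 private:
  T* p_ = nullptr;
};

struct Node : RefCounted { int id = 0; };

int main() {
  Shared<Node> instr(new Node);  // adopt a raw allocation, as in the builder
  Shared<Node> alias = instr;    // e.g. NodeData keeping its producer alive
  return alias->id;              // object freed when the last Shared drops
}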
- NodeData* Var(common::Shared& producer) { // NOLINT + NodeData* Var(cinn::common::Shared& producer) { // NOLINT auto* res = new NodeData(producer, 0, 0, node_name("var"), false); graph_->RegisterNode(producer->id(), res); graph_->RegisterNode(res->id(), producer.get()); @@ -141,7 +141,7 @@ class DotBuilder { NodeData* Concat(int axis, std::vector inputs) { const std::string type{"concat"}; - auto instr = common::Shared( + auto instr = cinn::common::Shared( new Node(framework::Operator::Get(type), type, node_name(type))); instr->attrs.attr_store["axis"] = axis; for (auto* in : inputs) { @@ -158,7 +158,7 @@ class DotBuilder { NodeData* lhs, NodeData* rhs) { const std::string type{dot_type_}; - auto instr = common::Shared( + auto instr = cinn::common::Shared( new Node(framework::Operator::Get(type), type, node_name(type))); matmul_ = instr.get(); instr->attrs.attr_store["trans_a"] = trans_a; @@ -177,7 +177,7 @@ class DotBuilder { NodeData* input, NodeData* output) { const std::string type{"slice"}; - auto instr = common::Shared( + auto instr = cinn::common::Shared( new Node(framework::Operator::Get(type), type, node_name(type))); instr->attrs.attr_store["axes"] = std::move(axes); instr->attrs.attr_store["starts"] = std::move(starts); diff --git a/paddle/cinn/hlir/pass/dot_merger_test.cc b/paddle/cinn/hlir/pass/dot_merger_test.cc index 2a9bdf9d4f1470..450cd4b5c3f53c 100644 --- a/paddle/cinn/hlir/pass/dot_merger_test.cc +++ b/paddle/cinn/hlir/pass/dot_merger_test.cc @@ -34,7 +34,7 @@ void RunModelTest(Program& program, // NOLINT &inputs_data.back(), inputs_data.back().size(), 0.0f, 1.0f, 1e-3); } - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); std::unordered_map, std::vector>> outputs; diff --git a/paddle/cinn/hlir/pass/fusion_helper_base.h b/paddle/cinn/hlir/pass/fusion_helper_base.h index d3c9e5c075529a..3437b334fa5df7 100644 --- a/paddle/cinn/hlir/pass/fusion_helper_base.h +++ b/paddle/cinn/hlir/pass/fusion_helper_base.h @@ -176,7 +176,8 @@ class FusionHelperBase { for (int idx = axes.back() + 1; idx < inshape.size(); ++idx) { lane = inshape[idx]; } - int max_num_threads = common::DefaultNVGPUTarget().max_num_threads(); + int max_num_threads = + cinn::common::DefaultNVGPUTarget().max_num_threads(); if (lane > max_num_threads / 2) { return 0; } @@ -212,7 +213,7 @@ class FusionHelperBase { return 0; } // target - const common::Target& target_; + const cinn::common::Target& target_; // output node set std::unordered_set output_nodes_set_; // shape dict diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass.cc b/paddle/cinn/hlir/pass/fusion_merge_pass.cc index 8d3233e23c7069..86c0e5360fc0d6 100644 --- a/paddle/cinn/hlir/pass/fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/fusion_merge_pass.cc @@ -26,8 +26,8 @@ using framework::NodeData; using framework::OpPatternKind; using framework::shape_t; -using common::GraphEdge; -using common::GraphNode; +using cinn::common::GraphEdge; +using cinn::common::GraphNode; using Comparator = Graph::Group::SharedGroupComparator; using Hasher = Graph::Group::SharedGroupHasher; diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc b/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc index f4582a5ce65be7..f6f9ecee97c430 100755 --- a/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc +++ b/paddle/cinn/hlir/pass/fusion_merge_pass_test.cc @@ -34,7 +34,7 @@ TEST(FusionMergePass, ElementWise_Fusion_0) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); 
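// The Concat/Matmul/Slice trio built above is the whole algebra of the
// dot-merger rewrite: two matmuls sharing a left operand equal one matmul
// over the column-concatenated right operands, followed by column slices.
// A self-contained toy check of that identity (plain nested vectors; nothing
// here is the CINN API):
#include <cassert>
#include <vector>

using Mat = std::vector<std::vector<float>>;

Mat MatMul(const Mat& a, const Mat& b) {
  Mat out(a.size(), std::vector<float>(b[0].size(), 0.f));
  for (size_t i = 0; i < a.size(); ++i)
    for (size_t k = 0; k < b.size(); ++k)
      for (size_t j = 0; j < b[0].size(); ++j) out[i][j] += a[i][k] * b[k][j];
  return out;
}

Mat ConcatCols(const Mat& b, const Mat& c) {  // concat along axis 1
  Mat out = b;
  for (size_t k = 0; k < out.size(); ++k)
    out[k].insert(out[k].end(), c[k].begin(), c[k].end());
  return out;
}

Mat SliceCols(const Mat& d, size_t begin, size_t end) {  // slice along axis 1
  Mat out;
  for (const auto& row : d)
    out.push_back(std::vector<float>(row.begin() + begin, row.begin() + end));
  return out;
}

int main() {
  Mat A = {{1, 2}, {3, 4}};                  // shared LHS, shape [2, 2]
  Mat B = {{1, 0}, {0, 1}};                  // RHS of dot #1, shape [2, 2]
  Mat C = {{2}, {5}};                        // RHS of dot #2, shape [2, 1]
  Mat merged = MatMul(A, ConcatCols(B, C));  // one wider GEMM, shape [2, 3]
  assert(SliceCols(merged, 0, 2) == MatMul(A, B));  // recovers dot #1
  assert(SliceCols(merged, 2, 3) == MatMul(A, C));  // recovers dot #2
  return 0;
}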
RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -60,7 +60,7 @@ TEST(FusionMergePass, ElementWise_Fusion_1) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -89,7 +89,7 @@ TEST(FusionMergePass, ElementWise_Fusion_2) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -118,7 +118,7 @@ TEST(FusionMergePass, ElementWise_Fusion_3) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -147,7 +147,7 @@ TEST(FusionMergePass, ElementWise_Fusion_4) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -169,7 +169,7 @@ TEST(FusionMergePass, ElementWise_Fusion_5) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -194,7 +194,7 @@ TEST(FusionMergePass, Broadcast_Test_0) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -219,7 +219,7 @@ TEST(FusionMergePass, Broadcast_Test_1) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -244,7 +244,7 @@ TEST(FusionMergePass, Broadcast_Test_2) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -269,7 +269,7 @@ TEST(FusionMergePass, Broadcast_Test_3) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -296,7 +296,7 @@ TEST(FusionMergePass, Broadcast_Test_4) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -323,7 +323,7 @@ TEST(FusionMergePass, Broadcast_Test_5) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -347,7 +347,7 @@ TEST(FusionMergePass, Reduce_Test_0) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -370,7 +370,7 @@ TEST(FusionMergePass, Reduce_Test_1) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = 
std::make_shared(program, target); @@ -396,7 +396,7 @@ TEST(FusionMergePass, Reduce_Test_2) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -422,7 +422,7 @@ TEST(FusionMergePass, Reduce_Test_3) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -449,7 +449,7 @@ TEST(FusionMergePass, Reduce_Test_4) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -473,7 +473,7 @@ TEST(FusionMergePass, Reduce_Test_5) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass_util.h b/paddle/cinn/hlir/pass/fusion_merge_pass_util.h index 6b6f786cab4a09..bc14748f5f6484 100644 --- a/paddle/cinn/hlir/pass/fusion_merge_pass_util.h +++ b/paddle/cinn/hlir/pass/fusion_merge_pass_util.h @@ -138,7 +138,7 @@ CONDITION_FUNC(honrizontal_elementwise_fuse_reduce) { } CONDITION_FUNC(elementwise_fuse_reduce) { - if (helper->target_ == common::DefaultHostTarget()) { + if (helper->target_ == cinn::common::DefaultHostTarget()) { return true; } // if same shape with horizontal relation @@ -427,7 +427,7 @@ CONDITION_FUNC(reduce_fuse_broadcast) { reduce_size *= reducer_input_shape[idx - 1]; } // Check if the reduce size exceeds the hardware limit - if (helper->target_ == common::DefaultNVGPUTarget() && + if (helper->target_ == cinn::common::DefaultNVGPUTarget() && reduce_size > helper->target_.max_num_threads()) { return false; } diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc index 6ea908ed31f318..cf1b91fcc13573 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc @@ -38,8 +38,8 @@ using framework::NodeData; using framework::OpPatternKind; using framework::shape_t; -using common::GraphEdge; -using common::GraphNode; +using cinn::common::GraphEdge; +using cinn::common::GraphNode; using GroupPtr = std::shared_ptr; using GroupList = std::vector; diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass/graph_group_fuse_helper.h b/paddle/cinn/hlir/pass/general_fusion_merge_pass/graph_group_fuse_helper.h index cd0ac4b0138422..3859ad88ff0169 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass/graph_group_fuse_helper.h +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass/graph_group_fuse_helper.h @@ -97,7 +97,7 @@ class GraphGroupFuseHelper final : public FuseHelper { Visit(node_producer); } }; - common::IsReachablePredicator is_reachable( + cinn::common::IsReachablePredicator is_reachable( MinDepth4Node, MaxDepth4Node, VisitNextNodes); return is_reachable(consumer, producer, [](OpGroupPtr) {}); } @@ -120,7 +120,7 @@ class GraphGroupFuseHelper final : public FuseHelper { Visit(node_producer); } }; - common::IsReachablePredicator is_reachable( + cinn::common::IsReachablePredicator is_reachable( MinDepth4Node, MaxDepth4Node, VisitNextNodes); return is_reachable(consumer, producer, [](OpGroupPtr) {}); } diff --git 
a/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h b/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h index 168edb3a97a4a0..2195d4a4f947bd 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass_utils.h @@ -143,7 +143,7 @@ static int GetSharedSize(const api::OpNode& op_node) { for (int idx = axes.back() + 1; idx < inshape.size(); ++idx) { lane = inshape[idx]; } - int max_num_threads = common::DefaultNVGPUTarget().max_num_threads(); + int max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads(); if (lane > max_num_threads / 2) { return 0; } diff --git a/paddle/cinn/hlir/pass/infershape.cc b/paddle/cinn/hlir/pass/infershape.cc index b082c98a0fcf69..041a63b42b57c0 100644 --- a/paddle/cinn/hlir/pass/infershape.cc +++ b/paddle/cinn/hlir/pass/infershape.cc @@ -24,7 +24,7 @@ namespace cinn { namespace hlir { namespace pass { -using common::Type; +using cinn::common::Type; using framework::Graph; using framework::Node; using framework::NodeData; @@ -34,7 +34,7 @@ using infershape_t = std::function( const std::vector&, const framework::AttrMapType&)>; using inferdtype_t = std::function( const std::vector&, const framework::AttrMapType&)>; -using dtype_dict_t = absl::flat_hash_map; +using dtype_dict_t = absl::flat_hash_map; using shape_dict_t = absl::flat_hash_map; void InferShape(Node* node, diff --git a/paddle/cinn/hlir/pass/infershape.h b/paddle/cinn/hlir/pass/infershape.h index db9b8e21e70208..12faf7b8de3aea 100644 --- a/paddle/cinn/hlir/pass/infershape.h +++ b/paddle/cinn/hlir/pass/infershape.h @@ -24,7 +24,7 @@ namespace pass { void InferShape( framework::Node* node, - absl::flat_hash_map& dtype_dict, // NOLINT + absl::flat_hash_map& dtype_dict, // NOLINT absl::flat_hash_map& shape_dict); // NOLINT diff --git a/paddle/cinn/hlir/pass/op_fusion_pass.cc b/paddle/cinn/hlir/pass/op_fusion_pass.cc index 84a95dfe277ddd..242b72f77e77f6 100644 --- a/paddle/cinn/hlir/pass/op_fusion_pass.cc +++ b/paddle/cinn/hlir/pass/op_fusion_pass.cc @@ -25,8 +25,8 @@ using framework::NodeData; using framework::OpPatternKind; using framework::shape_t; -using common::GraphEdge; -using common::GraphNode; +using cinn::common::GraphEdge; +using cinn::common::GraphNode; using GroupPtr = std::shared_ptr; using GroupList = std::vector; diff --git a/paddle/cinn/hlir/pass/op_fusion_pass_test.cc b/paddle/cinn/hlir/pass/op_fusion_pass_test.cc index f433cac8ca43dd..885afd929ba87e 100755 --- a/paddle/cinn/hlir/pass/op_fusion_pass_test.cc +++ b/paddle/cinn/hlir/pass/op_fusion_pass_test.cc @@ -34,7 +34,7 @@ TEST(OpFusionPass, ElementWise_Fusion_0) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -58,7 +58,7 @@ TEST(OpFusionPass, ElementWise_Fusion_1) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -81,7 +81,7 @@ TEST(OpFusionPass, Brodcast_Test_0) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -106,7 +106,7 @@ TEST(OpFusionPass, Brodcast_Test_1) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = 
cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -126,7 +126,7 @@ TEST(OpFusionPass, Brodcast_Test_2) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -150,7 +150,7 @@ TEST(OpFusionPass, Reduce_Test_0) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -175,7 +175,7 @@ TEST(OpFusionPass, Reduce_Test_1) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -200,7 +200,7 @@ TEST(OpFusionPass, Reduce_Test_2) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -224,7 +224,7 @@ TEST(OpFusionPass, Injective_Test_0) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -242,7 +242,7 @@ TEST(OP_LOWERING, Injective_Test_1) { auto F = net_builder.Add(D, E); auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -264,7 +264,7 @@ TEST(OpFusionPass, Test_Insert_BroadcastTo) { } auto program = net_builder.Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); hlir::framework::ApplyPass(graph.get(), "OpFusionPass"); diff --git a/paddle/cinn/hlir/pass/op_fusion_pass_util.h b/paddle/cinn/hlir/pass/op_fusion_pass_util.h index 796b6fcb0e10a8..a3a7365024de07 100644 --- a/paddle/cinn/hlir/pass/op_fusion_pass_util.h +++ b/paddle/cinn/hlir/pass/op_fusion_pass_util.h @@ -216,7 +216,7 @@ CONDITION_FUNC(horizontal_or_vertical_reduce_relation) { break; } - return helper->target_ == common::DefaultNVGPUTarget() + return helper->target_ == cinn::common::DefaultNVGPUTarget() ? (succesive_reduce_dimension <= helper->target_.max_num_threads() ? 
true : false) @@ -263,7 +263,7 @@ CONDITION_FUNC(reduce_fuse_broadcast) { return false; } - if (helper->target_ != common::DefaultNVGPUTarget()) { + if (helper->target_ != cinn::common::DefaultNVGPUTarget()) { return true; } diff --git a/paddle/cinn/hlir/pass/opfusion.cc b/paddle/cinn/hlir/pass/opfusion.cc index f95eed9873d959..537b9abb458817 100644 --- a/paddle/cinn/hlir/pass/opfusion.cc +++ b/paddle/cinn/hlir/pass/opfusion.cc @@ -26,8 +26,8 @@ namespace cinn { namespace hlir { namespace pass { -using common::GraphNode; -using common::Type; +using cinn::common::GraphNode; +using cinn::common::Type; using framework::Graph; using framework::Node; using framework::NodeData; @@ -203,8 +203,8 @@ class DomTree { struct GroupNode { GroupNode* parent{nullptr}; OpPatternKind pattern; - common::GraphNode* ref_node{nullptr}; - common::GraphNode* master_node{nullptr}; + cinn::common::GraphNode* ref_node{nullptr}; + cinn::common::GraphNode* master_node{nullptr}; int index{0}; int nodes_count{1}; int op_nodes_count{0}; @@ -518,7 +518,7 @@ class GraphPartition { } } } - void SplitGroups(const std::vector& graph_nodes) { + void SplitGroups(const std::vector& graph_nodes) { // split groups sorted by topo order CHECK_EQ(graph_nodes.size(), group_nodes_.size()); absl::flat_hash_map> group_maps; diff --git a/paddle/cinn/hlir/pass/opfusion_test.cc b/paddle/cinn/hlir/pass/opfusion_test.cc index 0dc87573cceb39..5df145453abd14 100644 --- a/paddle/cinn/hlir/pass/opfusion_test.cc +++ b/paddle/cinn/hlir/pass/opfusion_test.cc @@ -69,7 +69,7 @@ TEST(complex2, complex2) { auto e = program.relu(c); auto f = program.elementwise_add(d, e); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B, C, D, E}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -125,7 +125,7 @@ TEST(complex1, complex1) { auto e = program.relu(c); auto f = program.elementwise_add(d, e); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B, C, D, E}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -163,7 +163,7 @@ TEST(fuse_add_relu, fuse_add_relu) { auto c = program.elementwise_add(A, B, 1); auto d = program.relu(c); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -202,7 +202,7 @@ TEST(fuse_add, fuse_add) { auto c = program.elementwise_add(A, B, 1); auto d = program.elementwise_add(c, C, 1); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B, C}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -261,7 +261,7 @@ TEST(conv_bn_conv, conv_bn_conv) { auto f = program.elementwise_mul(e, D); auto g = program.relu(f); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B, C, D, E}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -313,7 +313,7 @@ TEST(fuse_conv_add, fuse_conv_add) { auto c = program.conv2d(A, B, attrs); auto d = program.elementwise_add(c, C, 1); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B, C}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -372,7 +372,7 @@ TEST(conv_add_mul, conv_add_mul) { auto d = program.elementwise_add(c, Scale); auto e = program.elementwise_mul(d, Bias, 1); - Target target = common::DefaultTarget(); + 
Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B, D}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -421,7 +421,7 @@ TEST(fuse_conv_add1, fuse_conv_add1) { auto c = program.conv2d(A, B, attrs); auto d = program.elementwise_add(c, C); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B, C}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -462,7 +462,7 @@ TEST(transpose_reshape_concat, transpose_reshape_concat) { auto d = program.reshape(b, {4, 32}); auto e = program.concat({c, d}); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B}); program.Validate(); LOG(INFO) << "Program:\n" << program; @@ -515,7 +515,7 @@ TEST(conv_bn, conv_bn) { auto d = program.fused_batchnorm_inference(c, Scale, Bias, Mean, Variance, attrs1); - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); program.SetInputs({A, B, Scale, Bias, Mean, Variance}); program.Validate(); LOG(INFO) << "Program:\n" << program; diff --git a/paddle/cinn/hlir/pass/reduce_split_pass.cc b/paddle/cinn/hlir/pass/reduce_split_pass.cc index bfcfde59ba0426..1f8c500cc9be05 100644 --- a/paddle/cinn/hlir/pass/reduce_split_pass.cc +++ b/paddle/cinn/hlir/pass/reduce_split_pass.cc @@ -24,7 +24,7 @@ namespace hlir { namespace pass { namespace { -using common::GraphNode; +using cinn::common::GraphNode; using framework::Node; using framework::NodeData; using framework::Operator; @@ -73,7 +73,7 @@ class ReduceSplitPass { public: // Find the reduce op with nwhc format and large shape, split it into two ops static int Apply(framework::Graph* graph) { - int MAX_NUM_THREADS = common::DefaultNVGPUTarget().max_num_threads(); + int MAX_NUM_THREADS = cinn::common::DefaultNVGPUTarget().max_num_threads(); constexpr int MAX_ITER_PER_THREAD = 32; // empirical value int cnt = 0; @@ -159,7 +159,7 @@ class ReduceSplitPass { // 1. reshape_loop > split_loop // 2. reshape thread > max_threads. 
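// Illustrative arithmetic for the two conditions above (shapes assumed for
// the example, not taken from this PR): reducing [256, 32, 32, 64] over
// dims {0, 1, 2} walks 256 * 32 * 32 = 262144 elements per output channel
// in one kernel. The split instead produces
//   reshape -> [512, 512, 64] -> reduce {0} -> [512, 64]
//           -> reduce {0} -> [64] -> reshape,
// keeping each kernel's reduce extent near sqrt(262144) = 512 and within
// the thread budget that the check just below enforces.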
           if (shape[0] <= reduce_numel0 &&
-              shape[1] * shape[2] <= common::GetMaxThreads()) {
+              shape[1] * shape[2] <= cinn::common::GetMaxThreads()) {
             VLOG(3) << " Don't Do Reduce Split!";
             continue;
           }
@@ -173,7 +173,7 @@ class ReduceSplitPass {
           // create reshape node0
           Node* reshape0 = new Node(Operator::Get("reshape"),
                                     "reshape",
-                                    common::UniqName("reshape_split"));
+                                    cinn::common::UniqName("reshape_split"));
           reshape0->attrs.attr_store["shape"] = std::vector<int>{
               reduce_numel0, reduce_numel1, in_shape[in_shape.size() - 1]};
           graph->RegisterNode(reshape0->id(), reshape0);
@@ -181,24 +181,24 @@ class ReduceSplitPass {
           in->UnLinkSingleTo(node);
           node->UnLinkSingleTo(out);
           auto reshape0_data = new NodeData(
-              Shared<Node>(reshape0), 0, 0, common::UniqName("var"), false);
+              Shared<Node>(reshape0), 0, 0, cinn::common::UniqName("var"), false);
           graph->RegisterNode(reshape0_data->id(), reshape0_data);
           reshape0->LinkTo(reshape0_data);
           shape_dict[reshape0_data->id()] =
               absl::get<std::vector<int>>(reshape0->attrs.attr_store.at("shape"));
-          dtype_dict[reshape0_data->id()] =
-              common::Str2Type(common::Type2Str(dtype_dict[in->id()]));
+          dtype_dict[reshape0_data->id()] = cinn::common::Str2Type(
+              cinn::common::Type2Str(dtype_dict[in->id()]));
 
           // create reduce node0
           Node* reduce0 = new Node(
-              Operator::Get(name), name, common::UniqName(name + "_split"));
+              Operator::Get(name), name, cinn::common::UniqName(name + "_split"));
           reduce0->attrs.attr_store["dim"] = std::vector<int>{0};
           reduce0->attrs.attr_store["keep_dim"] =
               absl::get<bool>(n->attrs.attr_store.at("keep_dim"));
           graph->RegisterNode(reduce0->id(), reduce0);
           reshape0_data->LinkTo(reduce0);
           auto reduce0_data = new NodeData(
-              Shared<Node>(reduce0), 0, 0, common::UniqName("var"), false);
+              Shared<Node>(reduce0), 0, 0, cinn::common::UniqName("var"), false);
           graph->RegisterNode(reduce0_data->id(), reduce0_data);
           reduce0->LinkTo(reduce0_data);
           shape_dict[reduce0_data->id()] =
@@ -207,12 +207,12 @@ class ReduceSplitPass {
                                in_shape[in_shape.size() - 1]}
                   : std::vector<int>{reduce_numel1, in_shape[in_shape.size() - 1]};
-          dtype_dict[reduce0_data->id()] =
-              common::Str2Type(common::Type2Str(dtype_dict[in->id()]));
+          dtype_dict[reduce0_data->id()] = cinn::common::Str2Type(
+              cinn::common::Type2Str(dtype_dict[in->id()]));
 
           // create reduce node1
           Node* reduce1 = new Node(
-              Operator::Get(name), name, common::UniqName(name + "_split"));
+              Operator::Get(name), name, cinn::common::UniqName(name + "_split"));
           reduce1->attrs.attr_store["dim"] =
               keep_dim ? std::vector<int>{0, 1} : std::vector<int>{0};
           reduce1->attrs.attr_store["keep_dim"] =
@@ -220,24 +220,24 @@ class ReduceSplitPass {
           graph->RegisterNode(reduce1->id(), reduce1);
           reduce0_data->LinkTo(reduce1);
           auto reduce1_data = new NodeData(
-              Shared<Node>(reduce1), 0, 0, common::UniqName("var"), false);
+              Shared<Node>(reduce1), 0, 0, cinn::common::UniqName("var"), false);
           graph->RegisterNode(reduce1_data->id(), reduce1_data);
           reduce1->LinkTo(reduce1_data);
           shape_dict[reduce1_data->id()] =
               keep_dim ? std::vector<int>{1, 1, in_shape[in_shape.size() - 1]}
                        : std::vector<int>{in_shape[in_shape.size() - 1]};
-          dtype_dict[reduce1_data->id()] =
-              common::Str2Type(common::Type2Str(dtype_dict[in->id()]));
+          dtype_dict[reduce1_data->id()] = cinn::common::Str2Type(
+              cinn::common::Type2Str(dtype_dict[in->id()]));
 
           // create reshape node1
           Node* reshape1 = new Node(Operator::Get("reshape"),
                                     "reshape",
-                                    common::UniqName("reshape_split"));
+                                    cinn::common::UniqName("reshape_split"));
           reshape1->attrs.attr_store["shape"] = out_shape;
           graph->RegisterNode(reshape1->id(), reshape1);
           reduce1_data->LinkTo(reshape1);
           reshape1->LinkTo(out);
-          out->source_node = common::Shared<Node>(reshape1);
+          out->source_node = cinn::common::Shared<Node>(reshape1);
 
           // drop old node
           graph->DropNode(node);
diff --git a/paddle/cinn/hlir/pass/reduce_split_pass_test.cc b/paddle/cinn/hlir/pass/reduce_split_pass_test.cc
index 4285c93dd75926..8319c6ae13ec4b 100644
--- a/paddle/cinn/hlir/pass/reduce_split_pass_test.cc
+++ b/paddle/cinn/hlir/pass/reduce_split_pass_test.cc
@@ -24,7 +24,7 @@ std::unordered_map<std::string, std::vector<float>> RunModelTest(
     const std::vector<std::string>&& passes,
     const std::unordered_map<std::string, std::vector<float>>& input_data,
     const std::unordered_set<std::string>& fetch_ids) {
-  auto target = common::DefaultTarget();
+  auto target = cinn::common::DefaultTarget();
   auto graph =
       std::make_shared<hlir::framework::Graph>(program, fetch_ids, target);
   hlir::framework::ApplyPasses(graph.get(), passes);
diff --git a/paddle/cinn/hlir/pass/single_group_optimize_pass.cc b/paddle/cinn/hlir/pass/single_group_optimize_pass.cc
index 1f8982192cddc5..816943b38cee08 100644
--- a/paddle/cinn/hlir/pass/single_group_optimize_pass.cc
+++ b/paddle/cinn/hlir/pass/single_group_optimize_pass.cc
@@ -25,14 +25,14 @@ namespace cinn::hlir::pass {
 
 using framework::Graph;
 using Group = framework::Graph::Group;
-using common::GraphEdge;
-using common::GraphNode;
+using cinn::common::GraphEdge;
+using cinn::common::GraphNode;
 using framework::Node;
 using framework::NodeData;
 
 using ShapeDict = absl::flat_hash_map<std::string, framework::shape_t>;
-using DtypeDict = absl::flat_hash_map<std::string, common::Type>;
+using DtypeDict = absl::flat_hash_map<std::string, cinn::common::Type>;
 
 namespace utils {
 template <typename T>
@@ -179,7 +179,7 @@ bool SingleGroupOptimizePass::CanReplaceToMemcpy(Node* node) const {
 }
 
 void SingleGroupOptimizePassImpl(Graph* graph) {
-  if (graph->target_ != common::DefaultNVGPUTarget()) {
+  if (graph->target_ != cinn::common::DefaultNVGPUTarget()) {
     return;
   }
   graph->fusion_groups = SingleGroupOptimizePass(graph).Apply();
diff --git a/paddle/cinn/hlir/pass/test_dot_merger.cc b/paddle/cinn/hlir/pass/test_dot_merger.cc
index ee4586571ec06e..bb7c832214750a 100644
--- a/paddle/cinn/hlir/pass/test_dot_merger.cc
+++ b/paddle/cinn/hlir/pass/test_dot_merger.cc
@@ -52,7 +52,7 @@ TEST(DotMerger, lhs) {
   auto h1 = builder.Add(e1, h);
   auto p = builder.Build();
-  Target target = common::DefaultNVGPUTarget();
+  Target target = cinn::common::DefaultNVGPUTarget();
   std::vector<std::string> input_ids;
   absl::c_transform(
       std::vector<std::string>{a.id(), b.id(), c.id(), c1.id()},
@@ -92,7 +92,7 @@ TEST(DotMerger, rhs) {
   auto e = builder.Matmul(b, c);
   auto f = builder.Concat({d, e}, axis);
   auto p = builder.Build();
-  Target target = common::DefaultNVGPUTarget();
+  Target target = cinn::common::DefaultNVGPUTarget();
   std::vector<std::string> input_ids;
   absl::c_transform(std::vector<std::string>{a.id(), b.id(), c.id()},
                     std::back_inserter(input_ids),
diff --git a/paddle/cinn/hlir/pass/test_primitive_ops.cc b/paddle/cinn/hlir/pass/test_primitive_ops.cc
index 2b39b5c57f27b8..c44eab12edd2cb 100644
--- a/paddle/cinn/hlir/pass/test_primitive_ops.cc
+++ b/paddle/cinn/hlir/pass/test_primitive_ops.cc
@@ -52,7 +52,7 @@ TEST(batch_norm_meta, batch_norm_meta) {
   auto b = program.fused_batchnorm_inference(A, Scale, Bias, Mean, Variance, attrs);
 
-  Target target = common::DefaultTarget();
+  Target target = cinn::common::DefaultTarget();
   program.SetInputs({A});
   program.Validate();
   LOG(INFO) << "Program:\n" << program;
@@ -91,7 +91,7 @@ TEST(reduction, reduce) {
   auto c = program.reduce_prod(A, axis, keep_dim);
   auto d = program.reduce_sum(A, {0, 1, 2, 3}, keep_dim);
 
-  Target target = common::DefaultTarget();
+  Target target = cinn::common::DefaultTarget();
   program.SetInputs({A});
   program.Validate();
   LOG(INFO) << "Program:\n" << program;
@@ -124,7 +124,7 @@ TEST(Compare, Compare) {
   Program program;
   auto a = program.primitive_equal(A, B);
 
-  Target target = common::DefaultTarget();
+  Target target = cinn::common::DefaultTarget();
   program.SetInputs({A, B});
   program.Validate();
   LOG(INFO) << "Program:\n" << program;
diff --git a/paddle/cinn/hlir/pe/broadcast.cc b/paddle/cinn/hlir/pe/broadcast.cc
index 0378e5fe4be214..0d7824955ade07 100644
--- a/paddle/cinn/hlir/pe/broadcast.cc
+++ b/paddle/cinn/hlir/pe/broadcast.cc
@@ -29,7 +29,7 @@ namespace cinn {
 namespace hlir {
 namespace pe {
 
-using common::make_zero;
+using cinn::common::make_zero;
 using ir::Tensor;
 using lang::Compute;
 
@@ -323,8 +323,8 @@ Tensor Atan2(const Tensor& A,
   auto fn = [&](const Expr& elem_a, const Expr& elem_b) {
     auto atan = lang::Atan(elem_a / elem_b);
-    auto pi = common::make_const(atan->type(), PI);
-    auto half_pi = common::make_const(atan->type(), PI / 2);
+    auto pi = cinn::common::make_const(atan->type(), PI);
+    auto half_pi = cinn::common::make_const(atan->type(), PI / 2);
     auto zero = ir::Zero(atan->type());
     return ir::Select::Make(
         ir::EQ::Make(elem_b, zero),
diff --git a/paddle/cinn/hlir/pe/broadcast.h b/paddle/cinn/hlir/pe/broadcast.h
index bc7a7da0e3d694..adae2fe33a4b63 100644
--- a/paddle/cinn/hlir/pe/broadcast.h
+++ b/paddle/cinn/hlir/pe/broadcast.h
@@ -43,12 +43,12 @@ void GetBroadcastOutShape(const std::vector<int>& input_shape1,
  * shape(A) = (2, 3, 4, 5), shape(B) = (2), with axis=0
  * shape(A) = (2, 3, 4, 5), shape(B) = (2, 1), with axis=0
  */
-#define HLIR_DCL_BC_PE(name__)                                              \
-  ir::Tensor name__(                                                        \
-      const ir::Tensor& A,                                                  \
-      const ir::Tensor& B,                                                  \
-      const std::string& out_name = common::UniqName("T_" #name__ "_out"),  \
-      const Expr& axis = Expr());
+#define HLIR_DCL_BC_PE(name__)                                       \
+  ir::Tensor name__(const ir::Tensor& A,                             \
+                    const ir::Tensor& B,                             \
+                    const std::string& out_name =                    \
+                        cinn::common::UniqName("T_" #name__ "_out"), \
+                    const Expr& axis = Expr());
 
 //! Compute A + B with auto-broadcasting.
 HLIR_DCL_BC_PE(Add);
@@ -107,13 +107,13 @@ ir::Tensor Pow(const ir::Tensor& A,
                const ir::Tensor& B,
                const std::string& output_name,
                const Expr& axis,
-               const common::Target& target);
+               const cinn::common::Target& target);
 
 ir::Tensor BroadcastTo(
     const ir::Tensor& A,
     const std::vector<int>& out_shape,
     const std::vector<int>& broadcast_axes,
-    const std::string& out_name = common::UniqName("T_broadcast_to_out"));
+    const std::string& out_name = cinn::common::UniqName("T_broadcast_to_out"));
 
 // This operator checks if all x and y satisfy the condition: |x - y| <= atol +
 // rtol * |y|
@@ -124,7 +124,7 @@ ir::Tensor IsClose(
     float rtol = 1e-05f,
     float atol = 1e-08f,
     bool equal_nan = false,
-    const std::string& out_name = common::UniqName("IsClose_output"));
+    const std::string& out_name = cinn::common::UniqName("IsClose_output"));
 
 }  // namespace pe
 }  // namespace hlir
diff --git a/paddle/cinn/hlir/pe/elementwise.cc b/paddle/cinn/hlir/pe/elementwise.cc
index 6a147a21b9a084..05455a85299569 100644
--- a/paddle/cinn/hlir/pe/elementwise.cc
+++ b/paddle/cinn/hlir/pe/elementwise.cc
@@ -250,7 +250,7 @@ ir::Tensor Arange(const float start,
         return ir::Cast::Make(
             dtype,
             Expr(start) +
-                Expr(step) * ir::Cast::Make(common::F32(), indices[0]));
+                Expr(step) * ir::Cast::Make(cinn::common::F32(), indices[0]));
       },
       output_name);
   return res;
diff --git a/paddle/cinn/hlir/pe/elementwise.h b/paddle/cinn/hlir/pe/elementwise.h
index 95e93c39d5c27e..e212fa9487a9c2 100644
--- a/paddle/cinn/hlir/pe/elementwise.h
+++ b/paddle/cinn/hlir/pe/elementwise.h
@@ -84,25 +84,26 @@ HLIR_DCL_UNARY_PE(Clz);
 HLIR_DCL_UNARY_PE(Popc);
 
 template <typename T>
-ir::Tensor AssignValue(const std::vector<T>& values,
-                       const common::Type& type = common::type_of<T>(),
-                       const std::string& output_name = "T_assign_value_out") {
+ir::Tensor AssignValue(
+    const std::vector<T>& values,
+    const cinn::common::Type& type = cinn::common::type_of<T>(),
+    const std::string& output_name = "T_assign_value_out") {
   CHECK(!values.empty())
       << "The input of pe::AssignValue should not empty! Please check.";
 
   auto out = lang::Compute(
      {ir::Expr(static_cast<int>(values.size()))},
      [=](const std::vector<ir::Expr>& indice) {
-        auto init_value = (type == common::type_of<T>())
+        auto init_value = (type == cinn::common::type_of<T>())
                               ? ir::Expr(values[0])
-                              : common::cast(ir::Expr(values[0]), type);
+                              : cinn::common::cast(ir::Expr(values[0]), type);
         ir::Expr previous = ir::Select::Make(
             ir::EQ::Make(indice[0], ir::Expr(0)), init_value, lang::Zero(type));
 
         for (int i = 1; i < values.size(); ++i) {
-          auto val = (type == common::type_of<T>())
+          auto val = (type == cinn::common::type_of<T>())
                          ? ir::Expr(values[i])
-                         : common::cast(ir::Expr(values[i]), type);
+                         : cinn::common::cast(ir::Expr(values[i]), type);
           previous = ir::Select::Make(
               ir::EQ::Make(indice[0], ir::Expr(i)), val, previous);
         }
diff --git a/paddle/cinn/hlir/pe/ir_schedule_pe.cc b/paddle/cinn/hlir/pe/ir_schedule_pe.cc
index b8f6d170996b38..2c27c98d5faf90 100644
--- a/paddle/cinn/hlir/pe/ir_schedule_pe.cc
+++ b/paddle/cinn/hlir/pe/ir_schedule_pe.cc
@@ -66,10 +66,10 @@ void SetReduceAxis(ir::Expr loop, ir::Expr block) {
 
 void IRElementwiseSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                            const std::vector<int> &output_shape,
-                           const common::Target &target) {
+                           const cinn::common::Target &target) {
   VLOG(3) << "Before IRElementwiseSchedule, new ir is : "
           << ir_sch.GetModule().GetExprs().at(0);
-  if (target == common::DefaultNVGPUTarget()) {
+  if (target == cinn::common::DefaultNVGPUTarget()) {
     auto blocks = ir_sch.GetAllBlocks();
     std::vector<ir::Expr> loops = ir_sch.GetLoops(blocks[0]);
     ir::Expr loop = ir_sch.Fuse(loops);
@@ -94,10 +94,10 @@ void IRElementwiseSchedule(ir::IRSchedule &ir_sch,  // NOLINT
 
 void IRInjectiveSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                          const std::vector<int> &output_shape,
-                         const common::Target &target) {
+                         const cinn::common::Target &target) {
   VLOG(3) << "Before IRInjectiveSchedule, new ir is : "
          << ir_sch.GetModule().GetExprs().at(0);
-  if (target == common::DefaultNVGPUTarget()) {
+  if (target == cinn::common::DefaultNVGPUTarget()) {
     auto blocks = ir_sch.GetAllBlocks();
     std::vector<ir::Expr> loops = ir_sch.GetLoops(blocks[0]);
     ir::Expr loop = ir_sch.Fuse(loops);
@@ -122,7 +122,7 @@ void IRInjectiveSchedule(ir::IRSchedule &ir_sch,  // NOLINT
 
 void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,  // NOLINT
                             const std::vector<int> &output_shape,
-                            const common::Target &target,
+                            const cinn::common::Target &target,
                             bool vectorizable) {
   VLOG(3) << "Begin IRScheduleInjectiveCPU"
           << ir_sch.GetModule().GetExprs().at(0);
@@ -159,7 +159,7 @@ void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,  // NOLINT
 
 void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
                              const std::vector<int> &output_shape,
-                             const common::Target &target) {
+                             const cinn::common::Target &target) {
   VLOG(3) << "Begin IRCudaScheduleInjective ";
   auto all_blocks = ir_sch.GetAllBlocks();
   auto loops = ir_sch.GetLoops(all_blocks[0]);
@@ -180,10 +180,10 @@ void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-std::vector<common::CINNValue> IRCudaScheduleMatMul(
-    const common::CINNValuePack &arg_pack,
+std::vector<cinn::common::CINNValue> IRCudaScheduleMatMul(
+    const cinn::common::CINNValuePack &arg_pack,
     const std::vector<ir::Expr> &output_shape,
-    const common::Target &target) {
+    const cinn::common::Target &target) {
   if (target.arch == Target::Arch::X86) {
     CINN_NOT_IMPLEMENTED
   }
@@ -230,12 +230,12 @@ std::vector<common::CINNValue> IRCudaScheduleMatMul(
     }
   }
 
-  return {common::CINNValue(ir_sch.GetModule().GetExprs().at(0))};
+  return {cinn::common::CINNValue(ir_sch.GetModule().GetExprs().at(0))};
 }
 
 void IRCudaScheduleMul(ir::IRSchedule &ir_sch,  // NOLINT
                        const std::vector<int> &output_shape,
-                       const common::Target &target) {
+                       const cinn::common::Target &target) {
   auto all_blocks = ir_sch.GetAllBlocks();
   auto loops = ir_sch.GetLoops(all_blocks.back());
   CHECK_GE(loops.size(), 2U);
@@ -248,7 +248,7 @@ void IRCudaScheduleMul(ir::IRSchedule &ir_sch,  // NOLINT
 
 void IRMulScheduleCPU(ir::IRSchedule &ir_sch,  // NOLINT
                       const std::vector<int> &reduce_first_shape,
-                      const common::Target &target) {
+                      const cinn::common::Target &target) {
   ir_sch.MergeExprs();
   auto all_blocks = ir_sch.GetAllBlocks();
   CHECK_EQ(all_blocks.size(),
4U); @@ -266,7 +266,7 @@ void IRMulScheduleCPU(ir::IRSchedule &ir_sch, // NOLINT void IRCudaSplitSchedule(ir::IRSchedule &ir_sch, // NOLINT const std::vector> &output_shapes, int axis, - const common::Target &target) { + const cinn::common::Target &target) { VLOG(3) << "In IRCudaSplitSchedule, Before schedule expr is : " << ir_sch.GetModule().GetExprs().at(0); ir_sch.MergeExprs(); @@ -294,7 +294,7 @@ void IRCudaSplitSchedule(ir::IRSchedule &ir_sch, // NOLINT block_names.push_back(get_block_name(block)); } // if output with same shape. - if (with_same_shape && target == common::DefaultNVGPUTarget()) { + if (with_same_shape && target == cinn::common::DefaultNVGPUTarget()) { // flat loops. { auto tsize = std::accumulate(output_shapes[0].begin(), @@ -326,7 +326,7 @@ void IRCudaSplitSchedule(ir::IRSchedule &ir_sch, // NOLINT master_loops[1]); } } - } else if (target == common::DefaultNVGPUTarget()) { + } else if (target == cinn::common::DefaultNVGPUTarget()) { // flat loops. { for (int idx = 0; idx < block_names.size(); ++idx) { @@ -362,7 +362,7 @@ void IRCudaSplitSchedule(ir::IRSchedule &ir_sch, // NOLINT void IRCudaScheduleReduce(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor output, int last_dimension_num, - const common::Target &target) { + const cinn::common::Target &target) { VLOG(3) << "Before IRCudaScheduleReduce : " << ir_sch.GetModule().GetExprs().at(0); int parallel_thread_num = 1; @@ -418,7 +418,7 @@ void IRCudaScheduleReduce(ir::IRSchedule &ir_sch, // NOLINT void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target) { + const cinn::common::Target &target) { VLOG(3) << "Before IRCudaScheduleBlockReduceInternal : " << ir_sch.GetModule().GetExprs().at(0); int fuse_times = ir_sch.GetLoops(tmp_out->name).size() - 2; @@ -443,7 +443,7 @@ void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch, // NOLINT ->schedule_block->as()); // create var - auto var = ir::Var(ir::Expr(0), ir::Expr(1), common::UniqName("i")); + auto var = ir::Var(ir::Expr(0), ir::Expr(1), cinn::common::UniqName("i")); out_block->as()->iter_values.push_back(var); out_block->as() ->schedule_block->as() @@ -517,7 +517,7 @@ void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor reduce_tmp_out, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target) { + const cinn::common::Target &target) { VLOG(3) << "Before IRCudaScheduleBlockReduce : " << ir_sch.GetModule().GetExprs().at(0); int tmp_put_shape_size_without_reduce = 0; @@ -667,7 +667,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor reshape, ir::Tensor internal, ir::Tensor reduce_out, - const common::Target &target) { + const cinn::common::Target &target) { VLOG(3) << "Before IRCudaScheduleBlockShuffleReduce : " << ir_sch.GetModule().GetExprs().at(0); // reshape compute inline @@ -929,7 +929,7 @@ void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor internal, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target) { + const cinn::common::Target &target) { VLOG(3) << "Before IRCudaTwoStepReduceSchedule : " << ir_sch.GetModule().GetExprs().at(0); // fuse axis @@ -1065,7 +1065,7 @@ void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis) { // NOLINT } void IRPoolScheduleGPU(ir::IRSchedule &ir_sch, // NOLINT - const common::Target &target, + const cinn::common::Target &target, int arg_pack_size) { VLOG(3) << "Before IRPoolScheduleGPU: " << ir_sch.GetModule().GetExprs().at(0); @@ 
-1083,7 +1083,7 @@ void IRPoolScheduleGPU(ir::IRSchedule &ir_sch, // NOLINT } void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch, // NOLINT - const common::Target &target) { + const cinn::common::Target &target) { VLOG(3) << "Before IRGlobalPoolScheduleGPU: " << ir_sch.GetModule().GetExprs().at(0); auto all_blocks = ir_sch.GetAllBlocks(); @@ -1152,7 +1152,7 @@ void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch, // NOLINT } void IRCudaScheduleConv(ir::IRSchedule &ir_sch, // NOLINT - const common::Target &target) { + const cinn::common::Target &target) { VLOG(3) << "Begin IRCudaScheduleConv with expr: " << ir_sch.GetModule().GetExprs().at(0); auto &res = ScheduleParam::get_cuda_instance().GetParam(); @@ -1297,7 +1297,7 @@ void IRCudaScheduleConv2(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor &input_pad, // NOLINT ir::Tensor &weights, // NOLINT ir::Tensor &output, // NOLINT - const common::Target &target, + const cinn::common::Target &target, const std::string &key) { auto &res = ScheduleParam::get_cuda_instance().GetParam(); diff --git a/paddle/cinn/hlir/pe/ir_schedule_pe.h b/paddle/cinn/hlir/pe/ir_schedule_pe.h index 5a7e32197220f1..4deb4a22277b9a 100644 --- a/paddle/cinn/hlir/pe/ir_schedule_pe.h +++ b/paddle/cinn/hlir/pe/ir_schedule_pe.h @@ -33,89 +33,89 @@ namespace pe { void IRElementwiseSchedule(ir::IRSchedule &ir_sch, // NOLINT const std::vector &output_shape, - const common::Target &target); + const cinn::common::Target &target); void IRInjectiveSchedule(ir::IRSchedule &ir_sch, // NOLINT const std::vector &output_shape, - const common::Target &target); + const cinn::common::Target &target); void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch, // NOLINT const std::vector &output_shape, - const common::Target &target, + const cinn::common::Target &target, bool vectorizable = true); void IRCudaScheduleInjective(ir::IRSchedule &ir_sch, // NOLINT const std::vector &output_shape, - const common::Target &target); + const cinn::common::Target &target); -std::vector IRCudaScheduleMatMul( - const common::CINNValuePack &arg_pack, +std::vector IRCudaScheduleMatMul( + const cinn::common::CINNValuePack &arg_pack, const std::vector &output_shape, - const common::Target &target); + const cinn::common::Target &target); void IRCudaScheduleMul(ir::IRSchedule &ir_sch, // NOLINT const std::vector &output_shape, - const common::Target &target); + const cinn::common::Target &target); void IRMulScheduleCPU(ir::IRSchedule &ir_sch, // NOLINT const std::vector &reduce_first_shape, - const common::Target &target); + const cinn::common::Target &target); void IRCudaSplitSchedule(ir::IRSchedule &ir_sch, // NOLINT const std::vector> &output_shapes, int axis, - const common::Target &target); + const cinn::common::Target &target); void IRCudaScheduleReduce(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor out, int last_dimension_num, - const common::Target &target); + const cinn::common::Target &target); void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor reduce_tmp_out, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target); + const cinn::common::Target &target); void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target); + const cinn::common::Target &target); void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor reshape, ir::Tensor internal, ir::Tensor out, - const common::Target &target); + const cinn::common::Target &target); void 
IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor reshape, ir::Tensor internal, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target); + const cinn::common::Target &target); void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis = -1); // NOLINT void IRPoolScheduleGPU(ir::IRSchedule &ir_sch, // NOLINT - const common::Target &target, + const cinn::common::Target &target, int arg_pack_size = 3); void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch, // NOLINT const std::vector &tensors); void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch, // NOLINT - const common::Target &target); + const cinn::common::Target &target); void IRCudaScheduleConv2(ir::IRSchedule &ir_sch, // NOLINT ir::Tensor &input_pad, // NOLINT ir::Tensor &weights, // NOLINT ir::Tensor &output, // NOLINT - const common::Target &target, + const cinn::common::Target &target, const std::string &key); void IRCudaScheduleConv(ir::IRSchedule &ir_sch, // NOLINT - const common::Target &target); + const cinn::common::Target &target); } // namespace pe } // namespace hlir diff --git a/paddle/cinn/hlir/pe/load_params_test.cc b/paddle/cinn/hlir/pe/load_params_test.cc index 897e8186db4eba..cc76519472b27e 100644 --- a/paddle/cinn/hlir/pe/load_params_test.cc +++ b/paddle/cinn/hlir/pe/load_params_test.cc @@ -29,7 +29,7 @@ TEST(load_x86_params, load_x86_params) { ASSERT_EQ(res.count(key), 1); absl::flat_hash_map conv2d_factors; - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); std::vector shape_input = {1, 64, 56, 56}; std::vector shape_weights = {64, 64, 3, 3}; std::vector strides = {1, 1}; diff --git a/paddle/cinn/hlir/pe/map_expr_to_ir.cc b/paddle/cinn/hlir/pe/map_expr_to_ir.cc index 84dd3e7d302eee..4385f543ccda5d 100644 --- a/paddle/cinn/hlir/pe/map_expr_to_ir.cc +++ b/paddle/cinn/hlir/pe/map_expr_to_ir.cc @@ -51,7 +51,7 @@ class MapExprToIrTranslator { public: explicit MapExprToIrTranslator(const MapExpr& map_expr, const Node2LoweredFuncs& node2lowered_funcs, - const common::Target& target) + const cinn::common::Target& target) : map_expr_(map_expr), node2lowered_funcs_(&node2lowered_funcs), target_(target) { @@ -744,7 +744,7 @@ class MapExprToIrTranslator { MapExpr map_expr_; const Node2LoweredFuncs* node2lowered_funcs_; - const common::Target target_; + const cinn::common::Target target_; TensorIteratorExpr4TensorT TensorIteratorExpr4Tensor; LoopDescriptor4LoopIteratorT LoopDescriptor4LoopIterator; }; @@ -752,7 +752,7 @@ class MapExprToIrTranslator { } // namespace ir::Expr MapExprToIr(const MapExprCtx& map_expr_ctx, - const common::Target& target) { + const cinn::common::Target& target) { const auto& expr = MapExprToIrTranslator( map_expr_ctx.map_expr(), map_expr_ctx.node2lowered_funcs(), target) diff --git a/paddle/cinn/hlir/pe/map_expr_to_ir.h b/paddle/cinn/hlir/pe/map_expr_to_ir.h index f4d37ad05c3527..32dd771cf5e2ae 100644 --- a/paddle/cinn/hlir/pe/map_expr_to_ir.h +++ b/paddle/cinn/hlir/pe/map_expr_to_ir.h @@ -25,6 +25,6 @@ struct Target; namespace cinn::adt { ir::Expr MapExprToIr(const MapExprCtx& map_expr_ctx, - const common::Target& target); + const cinn::common::Target& target); } diff --git a/paddle/cinn/hlir/pe/nn.cc b/paddle/cinn/hlir/pe/nn.cc index a3bae2149297f8..9c10e1ad137c24 100644 --- a/paddle/cinn/hlir/pe/nn.cc +++ b/paddle/cinn/hlir/pe/nn.cc @@ -43,7 +43,7 @@ using ir::Min; using ir::Select; using ir::Tensor; -std::string Type2StrForNN(common::Type type) { +std::string Type2StrForNN(cinn::common::Type type) { std::string 
suffix; if (type.is_float(64)) { return "fp64"; @@ -139,7 +139,7 @@ std::vector Conv2d_winograd_NCHW(const ir::Tensor &input, return ir::Select::Make( cond, weights(nn, cc, (yy / dilation_h), (xx / dilation_w)), - common::make_const(weights->type(), 0)); + cinn::common::make_const(weights->type(), 0)); }, UniqName("weights_dilation")); @@ -184,12 +184,12 @@ std::vector Conv2d_winograd_NCHW(const ir::Tensor &input, output_shape = { input->shape[0], // B weights->shape[0], // O - common::AutoSimplify( + cinn::common::AutoSimplify( (input->shape[2] - ((weights_dilation->shape[2] - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + 1), // H - common::AutoSimplify( + cinn::common::AutoSimplify( (input->shape[3] - ((weights_dilation->shape[3] - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + @@ -202,8 +202,8 @@ std::vector Conv2d_winograd_NCHW(const ir::Tensor &input, ir::Tensor B = winograd_transform[1]; ir::Tensor G = winograd_transform[2]; - int nH = (common::AutoSimplify(output_shape[2]).as_int32() + m - 1) / m; - int nW = (common::AutoSimplify(output_shape[3]).as_int32() + m - 1) / m; + int nH = (cinn::common::AutoSimplify(output_shape[2]).as_int32() + m - 1) / m; + int nW = (cinn::common::AutoSimplify(output_shape[3]).as_int32() + m - 1) / m; int P = input->shape[0].as_int32() * nH * nW; @@ -431,7 +431,7 @@ std::vector Conv2d_NCHW_5D(const ir::Tensor &input, int dilation_w, std::string key, const std::string &output_name, - const common::Target &target) { + const cinn::common::Target &target) { // input: 4D to 5D, NCHW->NCHWc // [batch, in_channel, in_height, in_width] -> // [batch, in_channel_chunk, in_height, in_width, in_channel_block] @@ -440,9 +440,9 @@ std::vector Conv2d_NCHW_5D(const ir::Tensor &input, std::vector shape_weights = weights->shape; CHECK_EQ(shape_input.size(), 4U) << "input's shape size should be 4"; CHECK_EQ(shape_weights.size(), 4U) << "weight's shape size should be 4"; - Expr c_in = common::AutoSimplify(shape_input[1]); - Expr c_filter = common::AutoSimplify(shape_weights[1]); - Expr c_out = common::AutoSimplify(shape_weights[0]); + Expr c_in = cinn::common::AutoSimplify(shape_input[1]); + Expr c_filter = cinn::common::AutoSimplify(shape_weights[1]); + Expr c_out = cinn::common::AutoSimplify(shape_weights[0]); absl::flat_hash_map conv2d_factors; int oc = c_out.as_int32(); int ic = c_in.as_int32(); @@ -507,12 +507,12 @@ std::vector Conv2d_NCHW_5D(const ir::Tensor &input, std::vector output_shape = { batch, // B c_out, // O - common::AutoSimplify((h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / - stride_h + - 1), // H - common::AutoSimplify((w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / - stride_w + - 1) // W + cinn::common::AutoSimplify( + (h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + + 1), // H + cinn::common::AutoSimplify( + (w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + + 1) // W }; auto res = Compute( output_shape, @@ -532,7 +532,7 @@ std::vector Conv2d_NCHWc(const ir::Tensor &input, int dilation_h, int dilation_w, const std::string &output_name, - const common::Target &target) { + const cinn::common::Target &target) { // input: [N, c_in_outer, H, W, c_in_inner] // weight: [c_out_outer, c_filter_outer, filter_h, filter_w, c_filter_inner, // c_out_inner] @@ -545,33 +545,33 @@ std::vector Conv2d_NCHWc(const ir::Tensor &input, << "Conv2d_NCHWc weight's shape size should be 6"; Expr batch = shape_input[0]; - Expr c_in_outer = common::AutoSimplify(shape_input[1]); + Expr c_in_outer = cinn::common::AutoSimplify(shape_input[1]); Expr 
h_in = shape_input[2]; Expr w_in = shape_input[3]; - Expr c_in_inner = common::AutoSimplify(shape_input[4]); + Expr c_in_inner = cinn::common::AutoSimplify(shape_input[4]); Expr c_out_outer = shape_weights[0]; - Expr c_filter_outer = common::AutoSimplify(shape_weights[1]); + Expr c_filter_outer = cinn::common::AutoSimplify(shape_weights[1]); Expr h_f = shape_weights[2]; Expr w_f = shape_weights[3]; - Expr c_filter_inner = common::AutoSimplify(shape_weights[4]); - Expr c_out_inner = common::AutoSimplify(shape_weights[5]); + Expr c_filter_inner = cinn::common::AutoSimplify(shape_weights[4]); + Expr c_out_inner = cinn::common::AutoSimplify(shape_weights[5]); - Expr c_filter = common::AutoSimplify(c_filter_outer * c_filter_inner); - Expr c_out = common::AutoSimplify(c_out_outer * c_out_inner); - Expr c_in = common::AutoSimplify(c_in_outer * c_in_inner); + Expr c_filter = cinn::common::AutoSimplify(c_filter_outer * c_filter_inner); + Expr c_out = cinn::common::AutoSimplify(c_out_outer * c_out_inner); + Expr c_in = cinn::common::AutoSimplify(c_in_outer * c_in_inner); Var fc(c_filter, UniqName("fc")); Var fy(h_f, UniqName("fy")); Var fx(w_f, UniqName("fx")); std::vector output_shape = { batch, // B c_out_outer, // O - common::AutoSimplify((h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / - stride_h + - 1), // H - common::AutoSimplify((w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / - stride_w + - 1), // W + cinn::common::AutoSimplify( + (h_in - ((h_f - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + + 1), // H + cinn::common::AutoSimplify( + (w_in - ((w_f - 1) * dilation_w + 1) + 2 * pad_w) / stride_w + + 1), // W c_out_inner}; ir::Tensor input_pad; @@ -583,18 +583,18 @@ std::vector Conv2d_NCHWc(const ir::Tensor &input, }, UniqName("input_pad")); } else { - auto pad_h_bound = common::AutoSimplify((output_shape[2] - 1) * stride_h + - (h_f - 1) * dilation_h + 1); - auto pad_w_bound = common::AutoSimplify((output_shape[3] - 1) * stride_w + - (w_f - 1) * dilation_w + 1); + auto pad_h_bound = cinn::common::AutoSimplify( + (output_shape[2] - 1) * stride_h + (h_f - 1) * dilation_h + 1); + auto pad_w_bound = cinn::common::AutoSimplify( + (output_shape[3] - 1) * stride_w + (w_f - 1) * dilation_w + 1); auto pad_out_h = std::min(pad_h_bound.as_int32(), - common::AutoSimplify(h_in + 2 * pad_h).as_int32()); + cinn::common::AutoSimplify(h_in + 2 * pad_h).as_int32()); auto pad_out_w = std::min(pad_w_bound.as_int32(), - common::AutoSimplify(w_in + 2 * pad_w).as_int32()); - auto h_in_pad = common::AutoSimplify(h_in + pad_h); - auto w_in_pad = common::AutoSimplify(w_in + pad_w); + cinn::common::AutoSimplify(w_in + 2 * pad_w).as_int32()); + auto h_in_pad = cinn::common::AutoSimplify(h_in + pad_h); + auto w_in_pad = cinn::common::AutoSimplify(w_in + pad_w); input_pad = Compute( {batch, c_in_outer, Expr(pad_out_h), Expr(pad_out_w), c_in_inner}, [=](Expr n, Expr icc, Expr yy, Expr xx, Expr icb) { @@ -614,20 +614,23 @@ std::vector Conv2d_NCHWc(const ir::Tensor &input, auto packed_out = Compute( output_shape, [=](Expr n, Expr oc_chunk, Expr oh, Expr ow, Expr oc_block) { - Expr c_out_per_group = common::AutoSimplify(c_out * c_filter / c_in); + Expr c_out_per_group = + cinn::common::AutoSimplify(c_out * c_filter / c_in); Expr ic_outer, ic_inner; if (c_in == c_filter) { - ic_outer = common::AutoSimplify(fc / c_in_inner); - ic_inner = common::AutoSimplify(fc % c_in_inner); + ic_outer = cinn::common::AutoSimplify(fc / c_in_inner); + ic_inner = cinn::common::AutoSimplify(fc % c_in_inner); } else { - ic_outer = 
common::AutoSimplify(((oc_chunk * c_out_inner + oc_block) / - c_out_per_group * c_filter + - fc) / - c_in_inner); - ic_inner = common::AutoSimplify(((oc_chunk * c_out_inner + oc_block) / - c_out_per_group * c_filter + - fc) % - c_in_inner); + ic_outer = + cinn::common::AutoSimplify(((oc_chunk * c_out_inner + oc_block) / + c_out_per_group * c_filter + + fc) / + c_in_inner); + ic_inner = + cinn::common::AutoSimplify(((oc_chunk * c_out_inner + oc_block) / + c_out_per_group * c_filter + + fc) % + c_in_inner); } return lang::ReduceSum(input_pad(n, ic_outer, @@ -754,7 +757,7 @@ std::vector Conv2d_NHWC(const ir::Tensor &input, return ir::Select::Make( cond, weights(nn, cc, yy / dilation_h, xx / dilation_w), - common::make_const(weights->type(), 0)); + cinn::common::make_const(weights->type(), 0)); }, UniqName("weights_dilation")); @@ -923,8 +926,8 @@ ir::Tensor BatchNorm_NCHW(const ir::Tensor &input, input->shape, [=](Expr n, Expr c, Expr h, Expr w) { return (input(n, c, h, w) - mean(c)) * scale(c) / - lang::Sqrt(variance(c) + - common::make_const(input->type(), epsilon)) + + lang::Sqrt(variance(c) + cinn::common::make_const( + input->type(), epsilon)) + bias(c); }, UniqName(output_name)); @@ -954,8 +957,8 @@ ir::Tensor BatchNorm_NCHWc(const ir::Tensor &input, [=](Expr n, Expr icc, Expr h, Expr w, Expr icb) { Expr new_c = icc * ic_bn + icb; return (input(n, icc, h, w, icb) - mean(new_c)) * scale(new_c) / - lang::Sqrt(variance(new_c) + - common::make_const(input->type(), epsilon)) + + lang::Sqrt(variance(new_c) + cinn::common::make_const( + input->type(), epsilon)) + bias(new_c); }, UniqName(output_name)); @@ -1101,8 +1104,8 @@ Tensor Pad(const Tensor &tensor, if (i >= pad_before.size()) { output_shape.push_back(tensor->shape[i]); } else { - auto shape = - common::AutoSimplify(tensor->shape[i] + pad_before[i] + pad_after[i]); + auto shape = cinn::common::AutoSimplify(tensor->shape[i] + pad_before[i] + + pad_after[i]); output_shape.push_back(shape); } } @@ -1128,8 +1131,8 @@ Tensor Pad(const Tensor &tensor, } Expr sel_after; if (!MathEqual(pad_after[i], Expr(0))) { - sel_after = - common::AutoSimplify(ovars[i] < pad_before[i] + tensor->shape[i]); + sel_after = cinn::common::AutoSimplify(ovars[i] < pad_before[i] + + tensor->shape[i]); sel.push_back(sel_after); } if (pad_mode == "edge") { @@ -1229,7 +1232,7 @@ std::vector PoolImpl(const Tensor &tensor, do_pad = (do_pad) ? 
do_pad : (padding_size[i] || padding_size[i + k_size]); if (ceil_mode) { - pad_tail[i] = common::AutoSimplify(pad_tail[i] + stride[i] - 1); + pad_tail[i] = cinn::common::AutoSimplify(pad_tail[i] + stride[i] - 1); } daxis.emplace_back(Var(kernel[i], UniqName("kernel_idx"))); @@ -1237,7 +1240,7 @@ std::vector PoolImpl(const Tensor &tensor, pad_before[ii] = pad_head[i]; pad_after[ii] = pad_tail[i]; - auto out_dim = common::AutoSimplify( + auto out_dim = cinn::common::AutoSimplify( (tensor->shape[ii] - kernel[i] + pad_head[i] + pad_tail[i]) / stride[i] + 1); @@ -1292,13 +1295,13 @@ std::vector PoolImpl(const Tensor &tensor, auto temp_factor = make_const(Int(32), 1); for (int i = 0; i < k_size; i++) { int ii = axis[i]; - start[i] = - common::AutoSimplify(output[ii] * stride[i] - pad_head[i]); + start[i] = cinn::common::AutoSimplify(output[ii] * stride[i] - + pad_head[i]); end[i] = Min::Make(start[i] + kernel[i], tensor->shape[ii]); start[i] = Max::Make(start[i], make_const(Int(32), 0)); temp_factor = temp_factor * (end[i] - start[i]); } - common::AutoSimplify(temp_factor); + cinn::common::AutoSimplify(temp_factor); Expr divide_factor = Max::Make(temp_factor, make_const(Int(32), 1)); return lang::ReduceSum( ir::Div::Make(temp(indices), @@ -1309,7 +1312,7 @@ std::vector PoolImpl(const Tensor &tensor, for (int i = 0; i < k_size; i++) { temp_factor = temp_factor * kernel[i]; } - common::AutoSimplify(temp_factor); + cinn::common::AutoSimplify(temp_factor); return lang::ReduceSum( ir::Div::Make(temp(indices), ir::Cast::Make(temp->type(), temp_factor)), @@ -1363,7 +1366,7 @@ std::vector PoolImpl(const Tensor &tensor, Expr(static_cast(tensor->shape[axis[i]].get_constant()) / kernel_size[i]); } - common::AutoSimplify(temp_factor); + cinn::common::AutoSimplify(temp_factor); Expr divide_factor = Max::Make(temp_factor, make_const(Int(32), 1)); return lang::ReduceSum( ir::Div::Make(temp(indices), @@ -1421,8 +1424,8 @@ std::vector GlobalPool2d(const Tensor &tensor, auto temp = Compute( {tensor->shape[0], tensor->shape[1], Expr(32)}, [=](Expr n, Expr c, Expr k) -> Expr { - Expr offset = common::IndiceToAbsOffset(tensor->shape, - {n, c, Expr(0), Expr(0)}); + Expr offset = cinn::common::IndiceToAbsOffset( + tensor->shape, {n, c, Expr(0), Expr(0)}); return lang::CallExtern( "cinn_warp_reduce_max_" + Type2StrForNN(tensor->type()), {tensor, offset, extend}); @@ -1440,8 +1443,8 @@ std::vector GlobalPool2d(const Tensor &tensor, auto temp = Compute( {tensor->shape[0], tensor->shape[1], Expr(32)}, [=](Expr n, Expr c, Expr k) -> Expr { - Expr offset = common::IndiceToAbsOffset(tensor->shape, - {n, c, Expr(0), Expr(0)}); + Expr offset = cinn::common::IndiceToAbsOffset( + tensor->shape, {n, c, Expr(0), Expr(0)}); return lang::CallExtern( "cinn_warp_reduce_avg_" + Type2StrForNN(tensor->type()), {tensor, offset, extend}); @@ -1547,7 +1550,7 @@ Tensor DropoutInfer(const ir::Tensor &tensor, tensor->shape, [=](const std::vector &indice) { return tensor(indice) * - common::make_const(tensor->type(), 1 - dropout_prob); + cinn::common::make_const(tensor->type(), 1 - dropout_prob); }, output_name); } else if (dropout_implementation == "upscale_in_train") { @@ -1572,7 +1575,7 @@ ir::Tensor Select(const ir::Tensor &condition, return lang::Compute( condition->shape, [=](const std::vector &indice) { - return common::select( + return cinn::common::select( condition(indice), true_value(indice), false_value(indice)); }, output_name); diff --git a/paddle/cinn/hlir/pe/nn.h b/paddle/cinn/hlir/pe/nn.h index 609bb9ade329f4..32e2db2dc38f71 
100755
--- a/paddle/cinn/hlir/pe/nn.h
+++ b/paddle/cinn/hlir/pe/nn.h
@@ -147,7 +147,7 @@ std::vector<ir::Tensor> Conv2d_NCHW_5D(
     int dilation_w,
     std::string key,
     const std::string &output_name = UniqName("T_Conv2d_NCHW_5D_out"),
-    const common::Target &target = common::DefaultHostTarget());
+    const cinn::common::Target &target = cinn::common::DefaultHostTarget());
 
 /**
  * @brief Perform a 2-D convolution with an NCHWc-layout.
@@ -176,7 +176,7 @@ std::vector<ir::Tensor> Conv2d_NCHWc(
     int dilation_h,
     int dilation_w,
     const std::string &output_name = UniqName("T_Conv2d_NCHWc_out"),
-    const common::Target &target = common::DefaultHostTarget());
+    const cinn::common::Target &target = cinn::common::DefaultHostTarget());
 
 #ifdef CINN_WITH_DNNL
 std::vector<ir::Tensor> Conv2d_NCHW_MKLDNN(
diff --git a/paddle/cinn/hlir/pe/nn_util.cc b/paddle/cinn/hlir/pe/nn_util.cc
index 22ae26d03aea7a..8ea958c5a499eb 100644
--- a/paddle/cinn/hlir/pe/nn_util.cc
+++ b/paddle/cinn/hlir/pe/nn_util.cc
@@ -387,13 +387,14 @@ ir::Tensor const_matrix(const std::vector<std::vector<float>>& input,
         auto now = cinn::common::make_const(1.0f);
         for (int ii = 0; ii < row; ii++) {
           for (int jj = 0; jj < col; jj++) {
-            // if (common::is_zero(Expr(ii)-yy) && common::is_zero(Expr(jj)-xx))
+            // if (cinn::common::is_zero(Expr(ii)-yy) &&
+            // cinn::common::is_zero(Expr(jj)-xx))
             // {
             //   now = cinn::common::make_const(input[ii][jj]);
             // }
             auto cond =
-                common::and_all({Expr(ii) - yy == 0, Expr(jj) - xx == 0});
-            now = common::select(
+                cinn::common::and_all({Expr(ii) - yy == 0, Expr(jj) - xx == 0});
+            now = cinn::common::select(
                 cond, cinn::common::make_const(input[ii][jj]), now);
           }
         }
@@ -461,7 +462,8 @@ std::vector<int> GetFirstStepReduceShape(const std::vector<int>& shape,
   // post parallel size
   int post_parallel_size = GetPostParallelSize(shape, axes);
   // the size to unfold las reduce axis
-  int unfold_size = common::GetMaxThreads() / GetParallelSize(shape, axes);
+  int unfold_size =
+      cinn::common::GetMaxThreads() / GetParallelSize(shape, axes);
   CHECK_GT(unfold_size, 1);
   // fuse reduce axis.
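Every hunk in this patch applies the same mechanical rewrite: an unqualified common:: inside CINN code becomes the fully qualified cinn::common::. The sketch below is a minimal, self-contained illustration, with hypothetical Where() helpers rather than the real Paddle headers, of why the full qualification matters once this change introduces a top-level common component alongside CINN's own cinn::common namespace: which namespace an unqualified common:: binds to depends on the scope it appears in, while the fully qualified spelling reads the same everywhere.

// Minimal sketch, hypothetical names only (not the real Paddle/CINN headers):
// shows that what an unqualified common:: names depends on the enclosing scope.
#include <iostream>

namespace common {  // stand-in for the new top-level common component
inline const char* Where() { return "::common"; }
}  // namespace common

namespace cinn {
namespace common {  // stand-in for CINN's pre-existing cinn::common
inline const char* Where() { return "cinn::common"; }
}  // namespace common

namespace hlir {
inline const char* Unqualified() {
  return common::Where();  // inside cinn::hlir this binds to cinn::common
}
inline const char* FullyQualified() {
  return cinn::common::Where();  // the spelling this patch uses everywhere
}
}  // namespace hlir
}  // namespace cinn

int main() {
  std::cout << cinn::hlir::Unqualified() << "\n";     // prints "cinn::common"
  std::cout << common::Where() << "\n";               // prints "::common"
  std::cout << cinn::hlir::FullyQualified() << "\n";  // prints "cinn::common"
}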
diff --git a/paddle/cinn/hlir/pe/pe_broadcast_test.cc b/paddle/cinn/hlir/pe/pe_broadcast_test.cc index 533c3ebdd97068..865f0f0bb475bd 100644 --- a/paddle/cinn/hlir/pe/pe_broadcast_test.cc +++ b/paddle/cinn/hlir/pe/pe_broadcast_test.cc @@ -43,7 +43,7 @@ void TestBroadcastPE(const std::string &fn_name, auto stages = CreateStages({C}); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Module::Builder builder("module0", target); auto func = Lower("fn", stages, {A, B, C}); builder.AddFunction(func); @@ -60,23 +60,24 @@ void TestBroadcastPE(const std::string &fn_name, cinn_buffer_t *A_buf; cinn_buffer_t *B_buf; if (set_value != 0) { - A_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + A_buf = cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) .set_val(set_value) .Build(); - B_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + B_buf = cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) .set_val(set_value) .Build(); } else { - A_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + A_buf = cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) .set_random() .Build(); - B_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + B_buf = cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) .set_random() .Build(); } - auto *C_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_zero() - .Build(); + auto *C_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_zero() + .Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf), c_arg(C_buf); cinn_pod_value_t args[] = {a_arg, b_arg, c_arg}; @@ -102,7 +103,7 @@ void TestBroadcastPE1(const std::string &fn_name, Placeholder B("B", {N}); auto C = func_op(A.tensor(), B.tensor(), "C", Expr(1)); auto stages = CreateStages({C}); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Module::Builder builder("module0", target); auto func = Lower("fn", stages, {A, B, C}); builder.AddFunction(func); @@ -116,22 +117,23 @@ void TestBroadcastPE1(const std::string &fn_name, cinn_buffer_t *A_buf; cinn_buffer_t *B_buf; if (set_value != 0) { - A_buf = common::BufferBuilder(Float(32), - {M.as_int32(), N.as_int32(), K.as_int32()}) + A_buf = cinn::common::BufferBuilder( + Float(32), {M.as_int32(), N.as_int32(), K.as_int32()}) .set_val(set_value) .Build(); - B_buf = common::BufferBuilder(Float(32), {N.as_int32()}) + B_buf = cinn::common::BufferBuilder(Float(32), {N.as_int32()}) .set_val(set_value) .Build(); } else { - A_buf = common::BufferBuilder(Float(32), - {M.as_int32(), N.as_int32(), K.as_int32()}) + A_buf = cinn::common::BufferBuilder( + Float(32), {M.as_int32(), N.as_int32(), K.as_int32()}) + .set_random() + .Build(); + B_buf = cinn::common::BufferBuilder(Float(32), {N.as_int32()}) .set_random() .Build(); - B_buf = - common::BufferBuilder(Float(32), {N.as_int32()}).set_random().Build(); } - auto *C_buf = common::BufferBuilder( + auto *C_buf = cinn::common::BufferBuilder( Float(32), {M.as_int32(), N.as_int32(), K.as_int32()}) .set_zero() .Build(); @@ -163,7 +165,7 @@ void TestBroadcastPE2(const std::string &fn_name, Placeholder B("B", {N, K}); auto C = func_op(A.tensor(), B.tensor(), "C", Expr(1)); auto stages = CreateStages({C}); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Module::Builder builder("module0", target); auto func = 
Lower("fn", stages, {A, B, C}); builder.AddFunction(func); @@ -178,25 +180,25 @@ void TestBroadcastPE2(const std::string &fn_name, cinn_buffer_t *B_buf; if (set_value != 0) { A_buf = - common::BufferBuilder( + cinn::common::BufferBuilder( Float(32), {M.as_int32(), N.as_int32(), K.as_int32(), R.as_int32()}) .set_val(set_value) .Build(); - B_buf = common::BufferBuilder(Float(32), {N.as_int32(), K.as_int32()}) + B_buf = cinn::common::BufferBuilder(Float(32), {N.as_int32(), K.as_int32()}) .set_val(set_value) .Build(); } else { A_buf = - common::BufferBuilder( + cinn::common::BufferBuilder( Float(32), {M.as_int32(), N.as_int32(), K.as_int32(), R.as_int32()}) .set_random() .Build(); - B_buf = common::BufferBuilder(Float(32), {N.as_int32(), K.as_int32()}) + B_buf = cinn::common::BufferBuilder(Float(32), {N.as_int32(), K.as_int32()}) .set_random() .Build(); } auto *C_buf = - common::BufferBuilder( + cinn::common::BufferBuilder( Float(32), {M.as_int32(), N.as_int32(), K.as_int32(), R.as_int32()}) .set_zero() .Build(); diff --git a/paddle/cinn/hlir/pe/pe_elementwise_test.cc b/paddle/cinn/hlir/pe/pe_elementwise_test.cc index c96a28a19762b1..806c340d791e2f 100644 --- a/paddle/cinn/hlir/pe/pe_elementwise_test.cc +++ b/paddle/cinn/hlir/pe/pe_elementwise_test.cc @@ -51,7 +51,7 @@ void TestElementwisePE(const std::string &fn_name, stages[A_out[0]]->Parallel(0); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Module::Builder builder("module0", target); for (auto &tensor : A_out) { stages->InsertLazily(tensor); @@ -70,15 +70,15 @@ void TestElementwisePE(const std::string &fn_name, cinn_buffer_t *A_buf; if (set_value != 0) { - A_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + A_buf = cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) .set_val(set_value) .Build(); } else { - A_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + A_buf = cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) .set_random() .Build(); } - auto *B_buf = common::BufferBuilder(type, {M.as_int32(), N.as_int32()}) + auto *B_buf = cinn::common::BufferBuilder(type, {M.as_int32(), N.as_int32()}) .set_align(type.bits()) .Build(); diff --git a/paddle/cinn/hlir/pe/pe_transform_test.cc b/paddle/cinn/hlir/pe/pe_transform_test.cc index b69b48b4b85bf2..852cc26211298e 100644 --- a/paddle/cinn/hlir/pe/pe_transform_test.cc +++ b/paddle/cinn/hlir/pe/pe_transform_test.cc @@ -52,7 +52,7 @@ TEST(MatmulPE, MatmulCase1) { tensor_args.push_back(C[i]); stages->InsertLazily(C[i]); } - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Module::Builder builder("module0", target); auto func = Lower("fn", stages, tensor_args); builder.AddFunction(func); @@ -66,9 +66,9 @@ TEST(MatmulPE, MatmulCase1) { CHECK(fn); auto fn_ = reinterpret_cast(fn); cinn_buffer_t *A_buf = - common::BufferBuilder(Float(32), {m, k}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {m, k}).set_random().Build(); cinn_buffer_t *B_buf = - common::BufferBuilder(Float(32), {k, n}).set_random().Build(); + cinn::common::BufferBuilder(Float(32), {k, n}).set_random().Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf); std::vector args = {a_arg, b_arg}; std::vector C_buf; @@ -77,7 +77,8 @@ TEST(MatmulPE, MatmulCase1) { for (auto &shape : C[i]->shape) { shapes.push_back(shape.as_int32()); } - auto *buffer = common::BufferBuilder(Float(32), shapes).set_zero().Build(); + auto *buffer = + 
cinn::common::BufferBuilder(Float(32), shapes).set_zero().Build(); CHECK(buffer); C_buf.push_back(buffer); cinn_pod_value_t arg(buffer); @@ -115,9 +116,9 @@ TEST(ScatterAssign, ScatterAssign) { int axis = 0; #ifdef CINN_WITH_CUDA - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); #else - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); #endif auto output = hlir::pe::ScatterAssign( @@ -170,7 +171,7 @@ TEST(SliceAssign, SliceAssign) { LOG(INFO) << "func:\n" << func; #ifdef CINN_WITH_CUDA - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); Module::Builder builder("SliceAssign_Builder", target); builder.AddFunction(func); @@ -211,7 +212,7 @@ TEST(Concat, ConcatCase0) { LOG(INFO) << "func:\n" << func; #ifdef CINN_WITH_CUDA - auto target = common::DefaultNVGPUTarget(); + auto target = cinn::common::DefaultNVGPUTarget(); Module::Builder builder("Concat_Builder", target); builder.AddFunction(func); diff --git a/paddle/cinn/hlir/pe/reduction.cc b/paddle/cinn/hlir/pe/reduction.cc index f809efbd13e67d..e4850e96dabcd7 100644 --- a/paddle/cinn/hlir/pe/reduction.cc +++ b/paddle/cinn/hlir/pe/reduction.cc @@ -73,7 +73,7 @@ void GetRealAxes(int ndim, } } -std::string Type2StrForReduce(common::Type type) { +std::string Type2StrForReduce(cinn::common::Type type) { std::string suffix; if (type.is_int(32)) { return "_int32"; @@ -114,7 +114,7 @@ void GetOutputShape(const std::vector& real_axes, if (keep_dims) { for (size_t i = 0; i < ndim; ++i) { if (std::find(real_axes.begin(), real_axes.end(), i) != real_axes.end()) { - output_shape->push_back(common::make_one()); + output_shape->push_back(cinn::common::make_one()); } else { output_shape->push_back(tensor->shape[i]); } @@ -127,7 +127,7 @@ void GetOutputShape(const std::vector& real_axes, } } if (output_shape->empty()) { - output_shape->push_back(common::make_one()); + output_shape->push_back(cinn::common::make_one()); } } @@ -300,7 +300,7 @@ std::vector WarpReduce(const ir::Tensor& A, tmp_indexs.push_back(Expr(0)); } CHECK_EQ(A->shape.size(), tmp_indexs.size()); - Expr offset = common::IndiceToAbsOffset(A->shape, tmp_indexs); + Expr offset = cinn::common::IndiceToAbsOffset(A->shape, tmp_indexs); return lang::CallExtern(reduce_type, {A, offset, reduce_width}); }, UniqName(output_name + "_" + reduce_type)); @@ -530,7 +530,7 @@ std::vector BlockReduce(const ir::Tensor& A, // checkout input shape size equals tmp indexs size. CHECK_EQ(A->shape.size(), tmp_indexs.size()); // compute offset. - Expr offset = common::IndiceToAbsOffset(A->shape, tmp_indexs); + Expr offset = cinn::common::IndiceToAbsOffset(A->shape, tmp_indexs); // call block reduce sum return lang::CallExtern(reduce_type, {A, offset, reduce_width}); }, @@ -753,7 +753,7 @@ std::vector ReduceInternal(const ir::Tensor& A, const std::vector& axes, \ const bool keep_dim, \ const std::string& output_name) { \ - if (common::GetMaxThreads() / GetParallelSize(A, axes) <= 1) { \ + if (cinn::common::GetMaxThreads() / GetParallelSize(A, axes) <= 1) { \ return {Reduce##name(A, axes, keep_dim, output_name)}; \ } else { \ auto rs = ReduceInternal( \ @@ -824,7 +824,7 @@ std::vector TwoStepBlockReduceInternal( // If the number of current device SM is smaller than the number of SM // required by Warp Reduce, the performance of Warp Reduce is better. // Otherwise, use Block Reduce. 
- auto max_num_threads = common::DefaultNVGPUTarget().max_num_threads(); + auto max_num_threads = cinn::common::DefaultNVGPUTarget().max_num_threads(); int need_reduce_last_count = 1; for (int i = 0; i < A->shape.size(); i++) { if (find(axes.begin(), axes.end(), i) == axes.end()) { @@ -834,9 +834,9 @@ std::vector TwoStepBlockReduceInternal( int warp_reduce_need_sm_count = ceil((need_reduce_last_count * 32) / static_cast( - common::DefaultNVGPUTarget().get_max_threads_per_sm())); + cinn::common::DefaultNVGPUTarget().get_max_threads_per_sm())); // Set Num_max_threads to 32 is Warp Reduce - if (common::DefaultNVGPUTarget().get_multi_processor_count() < + if (cinn::common::DefaultNVGPUTarget().get_multi_processor_count() < warp_reduce_need_sm_count) { max_num_threads = 32; } diff --git a/paddle/cinn/hlir/pe/reduction.h b/paddle/cinn/hlir/pe/reduction.h index a3a5f02915ef9f..4779007daba426 100644 --- a/paddle/cinn/hlir/pe/reduction.h +++ b/paddle/cinn/hlir/pe/reduction.h @@ -471,7 +471,7 @@ std::vector TwoStepBlockReduceAny( std::string CrossThreadReduceExternalFuncName(const ir::Expr& op, const ir::Expr& tensor); -std::string Type2StrForReduce(common::Type type); +std::string Type2StrForReduce(cinn::common::Type type); } // namespace pe } // namespace hlir } // namespace cinn diff --git a/paddle/cinn/hlir/pe/schedule.cc b/paddle/cinn/hlir/pe/schedule.cc index 6e9cfe6d887566..c75f9aefccf29c 100644 --- a/paddle/cinn/hlir/pe/schedule.cc +++ b/paddle/cinn/hlir/pe/schedule.cc @@ -36,13 +36,13 @@ namespace cinn { namespace hlir { namespace pe { -ScheduleParam::ScheduleParam(common::Target::Arch arch) { +ScheduleParam::ScheduleParam(cinn::common::Target::Arch arch) { switch (arch) { - case common::Target::Arch::X86: { + case cinn::common::Target::Arch::X86: { param_data = CreateX86Params(); break; } - case common::Target::Arch::NVGPU: { + case cinn::common::Target::Arch::NVGPU: { param_data = CreateCudaParams(); break; } @@ -85,7 +85,7 @@ int SplitEven(int origin) { return res; } -int GetBasicFactor(const Type &type, const common::Target &target) { +int GetBasicFactor(const Type &type, const cinn::common::Target &target) { int target_native_vector_bits = target.get_target_bits() * 8; int type_bits = type.bits(); return target_native_vector_bits / type_bits; @@ -114,7 +114,7 @@ int GetVectorizeFactor(int shape, int split_factor) { void ScheduleInjectiveCPU(poly::Stage *stage, const std::vector &output_shape, - const common::Target &target, + const cinn::common::Target &target, bool vectorizable) { int dims = stage->n_out_dims(); int factor = GetBasicFactor(stage->tensor()->type(), target); @@ -142,7 +142,7 @@ void ScheduleInjectiveCPU(poly::Stage *stage, void ScheduleInjectiveCPU1(poly::Stage *stage, const std::vector &output_shape, - const common::Target &target, + const cinn::common::Target &target, bool vectorizable) { int dims = stage->n_out_dims(); if (dims > 1) { @@ -187,7 +187,7 @@ void ScheduleInjectiveCPU1(poly::Stage *stage, int GetArrayPackingFactor(int shape, const Type &type, - const common::Target &target) { + const cinn::common::Target &target) { int split_base = GetBasicFactor(type, target); int split_factor = 1; // temporily use shape-1 instead of shape for isl wrong for1 elimination @@ -203,7 +203,7 @@ int GetArrayPackingFactor(int shape, void MatmulScheduleCUDA(poly::StageMap stages, const ir::Tensor &output, - const common::Target &target) { + const cinn::common::Target &target) { stages[output]->Split(1, 2); stages[output]->Bind(0, "blockIdx.x"); stages[output]->Bind(1, 
"threadIdx.x"); @@ -212,7 +212,7 @@ void MatmulScheduleCUDA(poly::StageMap stages, void MatmulScheduleCPU(poly::StageMap stages, const ir::Tensor &output, const ir::Tensor &packedB, - const common::Target &target) { + const cinn::common::Target &target) { CHECK_EQ(output->type(), packedB->type()); int basic_split_factor = GetBasicFactor(packedB->type(), target); // packedB @@ -324,7 +324,7 @@ void MatmulScheduleCPU(poly::StageMap stages, void MulScheduleCPU(poly::StageMap stages, const ir::Tensor &output, const ir::Tensor &reduce_first, - const common::Target &target) { + const cinn::common::Target &target) { int split_factor = GetBasicFactor(output->type(), target); auto out_reduce_axis = output->reduce_axis; std::vector reduce_first_shape = reduce_first->shape; @@ -384,7 +384,7 @@ int GetBlockBindAxis(const std::vector &shape, void CudaReduceSchedule(poly::StageMap stages, ir::Tensor output, int last_dimension_num, - const common::Target &target) { + const cinn::common::Target &target) { int parallel_thread_num = 1; for (int idx = output->shape.size() - 1; idx >= static_cast(output->shape.size()) - last_dimension_num; @@ -419,7 +419,7 @@ void CudaReduceSchedule(poly::StageMap stages, void CudaWarpReduceSchedule(poly::StageMap stages, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target) { + const cinn::common::Target &target) { int sum_out_dim = 1; for (int idx = 0; idx < static_cast(tmp_out->shape.size()) - 2; ++idx) { stages[out]->Fuse(0, 1); @@ -456,7 +456,7 @@ void CudaWarpReduceSchedule(poly::StageMap stages, void CudaBlockReduceInternalSchedule(poly::StageMap stages, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target) { + const cinn::common::Target &target) { for (int idx = 0; idx < static_cast(tmp_out->shape.size()) - 2; ++idx) { stages[tmp_out]->Fuse(0, 1); stages[out]->Fuse(0, 1); @@ -479,7 +479,7 @@ void CudaBlockReduceSchedule(poly::StageMap stages, ir::Tensor reduce_tmp_out, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target) { + const cinn::common::Target &target) { int output_shape_size_without_reduce = tmp_out->shape.size() - 1; // fuse last parallel dimension for (int idx = 0; idx < reduce_tmp_out->shape.size() - tmp_out->shape.size(); @@ -518,7 +518,7 @@ void CudaBlockShuffleReduceSchedule(poly::StageMap stages, ir::Tensor reshape, ir::Tensor internal, ir::Tensor out, - const common::Target &target) { + const cinn::common::Target &target) { int fuse_times = internal->shape.size() - 2; for (int idx = 0; idx < fuse_times; ++idx) { stages[internal]->Fuse(0, 1); @@ -557,7 +557,7 @@ void CudaTwoStepReduceSchedule(poly::StageMap stages, ir::Tensor internal, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target) { + const cinn::common::Target &target) { // fuse axis for (int idx = 0; idx < static_cast(internal->shape.size()) - 2; ++idx) { stages[internal]->Fuse(0, 1); @@ -604,7 +604,7 @@ void SoftmaxScheduleCPU(poly::StageMap stage, void GlobalPoolScheduleGPU(poly::StageMap stages, const std::vector &output, - const common::Target &target) { + const cinn::common::Target &target) { auto &out = output[0]; auto &reduce = output[1]; stages[out]->Fuse(0, 1); @@ -617,7 +617,7 @@ void GlobalPoolScheduleGPU(poly::StageMap stages, } void PoolScheduleCPU(poly::StageMap stages, const ir::Tensor &output, - const common::Target &target) { + const cinn::common::Target &target) { CHECK_GE(stages[output]->n_out_dims(), 2); stages[output]->Fuse({0, 1}); stages[output]->Parallel(0); @@ -625,7 +625,7 @@ void PoolScheduleCPU(poly::StageMap 
stages, void PoolScheduleGPU(poly::StageMap stages, const ir::Tensor &output, - const common::Target &target) { + const cinn::common::Target &target) { CHECK_GE(stages[output]->axis_names().size(), 4); stages[output]->Fuse({0, 1, 2, 3}); stages[output]->Split(0, 1024); @@ -640,7 +640,7 @@ void GetConv2dFactors(absl::flat_hash_map *factors, int oh, int ow, const Type &type, - const common::Target &target, + const cinn::common::Target &target, const std::string &key, bool import_params) { if (import_params) { @@ -742,7 +742,7 @@ void GetConv2d1x1Factors(absl::flat_hash_map *factors, int oh, int ow, const Type &type, - const common::Target &target) { + const cinn::common::Target &target) { int bn_base = GetBasicFactor(type, target); int oc_bn = 1; for (int i = bn_base; i > 1; i--) { @@ -870,7 +870,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, - const common::Target &target, + const cinn::common::Target &target, const std::string &key, bool do_padding) { CHECK(target.arch == Target::Arch::X86) @@ -881,8 +881,8 @@ void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages, absl::flat_hash_map conv2d_factors; CHECK_EQ(packed_out->shape.size(), 5U) << "packed_out's shape size should be 5"; - Expr h_out = common::AutoSimplify(packed_out->shape[2]); - Expr w_out = common::AutoSimplify(packed_out->shape[3]); + Expr h_out = cinn::common::AutoSimplify(packed_out->shape[2]); + Expr w_out = cinn::common::AutoSimplify(packed_out->shape[3]); int oh = h_out.as_int32(); int ow = w_out.as_int32(); int basic_split_factor = GetBasicFactor(type, target); @@ -892,8 +892,8 @@ void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages, auto input_shape = input_pad->shape; CHECK_EQ(input_shape.size(), 5U) << "input shape size should be 5"; - Expr oc_bn = common::AutoSimplify(packed_out->shape.back()); - Expr ic_bn = common::AutoSimplify(input_shape.back()); + Expr oc_bn = cinn::common::AutoSimplify(packed_out->shape.back()); + Expr ic_bn = cinn::common::AutoSimplify(input_shape.back()); int oc_bn_size = oc_bn.as_int32(); int ic_bn_size = ic_bn.as_int32(); VLOG(3) << "oh_bn_size " << oh_bn_size; @@ -1021,7 +1021,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, - const common::Target &target) { + const cinn::common::Target &target) { CHECK(target.arch == Target::Arch::X86) << "Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse schedule only used in x86"; CHECK(packed_out.defined()); @@ -1030,8 +1030,8 @@ void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages, absl::flat_hash_map conv2d_factors; CHECK_EQ(packed_out->shape.size(), 5U) << "packed_out's shape size should be 5"; - Expr h_out = common::AutoSimplify(packed_out->shape[2]); - Expr w_out = common::AutoSimplify(packed_out->shape[3]); + Expr h_out = cinn::common::AutoSimplify(packed_out->shape[2]); + Expr w_out = cinn::common::AutoSimplify(packed_out->shape[3]); int oh = h_out.as_int32(); int ow = w_out.as_int32(); int basic_split_factor = GetBasicFactor(type, target); @@ -1042,8 +1042,8 @@ void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages, auto input_shape = input_pad->shape; int shape_size = input_shape.size(); CHECK_EQ(shape_size, 5U) << "input shape size should be 5"; - Expr oc_bn = common::AutoSimplify(packed_out->shape.back()); - Expr ic_bn = common::AutoSimplify(input_shape.back()); + Expr oc_bn = 
cinn::common::AutoSimplify(packed_out->shape.back()); + Expr ic_bn = cinn::common::AutoSimplify(input_shape.back()); int oc_bn_size = oc_bn.as_int32(); int ic_bn_size = ic_bn.as_int32(); VLOG(3) << "ow_bn_size" << ow_bn_size; @@ -1143,7 +1143,7 @@ void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, - const common::Target &target) { + const cinn::common::Target &target) { CHECK(target.arch == Target::Arch::X86) << "Conv2d_NCHWc_Schedule_CPU_Nofuse schedule only used in x86"; CHECK(packed_out.defined()); @@ -1152,7 +1152,7 @@ void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages, absl::flat_hash_map conv2d_factors; CHECK_EQ(packed_out->shape.size(), 5U) << "packed_out's shape size should be 5"; - Expr w_out = common::AutoSimplify(packed_out->shape[3]); + Expr w_out = cinn::common::AutoSimplify(packed_out->shape[3]); int ow = w_out.as_int32(); int basic_split_factor = GetBasicFactor(type, target); GetConv2dFactors(&conv2d_factors, -1, -1, -1, -1, ow, type, target); @@ -1161,8 +1161,8 @@ void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages, auto input_shape = input_pad->shape; int shape_size = input_shape.size(); CHECK_EQ(shape_size, 5U) << "input shape size should be 5"; - Expr oc_bn = common::AutoSimplify(packed_out->shape.back()); - Expr ic_bn = common::AutoSimplify(input_shape.back()); + Expr oc_bn = cinn::common::AutoSimplify(packed_out->shape.back()); + Expr ic_bn = cinn::common::AutoSimplify(input_shape.back()); int oc_bn_size = oc_bn.as_int32(); int ic_bn_size = ic_bn.as_int32(); VLOG(3) << "ow_bn_size " << ow_bn_size; @@ -1248,7 +1248,7 @@ void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, - const common::Target &target, + const cinn::common::Target &target, const std::string &key, bool do_padding) { CHECK(target.arch == Target::Arch::X86) @@ -1258,13 +1258,13 @@ void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages, auto type = packed_out->type(); CHECK_EQ(packed_out->shape.size(), 5U) << "packed_out's shape size should be 5"; - Expr w_out = common::AutoSimplify(packed_out->shape[3]); + Expr w_out = cinn::common::AutoSimplify(packed_out->shape[3]); int ow = w_out.as_int32(); auto input_shape = input_pad->shape; int shape_size = input_shape.size(); CHECK_EQ(shape_size, 5U) << "input shape size should be 5"; - Expr oc_bn = common::AutoSimplify(packed_out->shape.back()); - Expr ic_bn = common::AutoSimplify(input_shape.back()); + Expr oc_bn = cinn::common::AutoSimplify(packed_out->shape.back()); + Expr ic_bn = cinn::common::AutoSimplify(input_shape.back()); int oc_bn_size = oc_bn.as_int32(); int ic_bn_size = ic_bn.as_int32(); @@ -1381,7 +1381,7 @@ void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse( const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, - const common::Target &target, + const cinn::common::Target &target, bool do_padding) { CHECK(target.arch == Target::Arch::X86) << "Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse schedule only used in x86"; @@ -1391,7 +1391,7 @@ void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse( absl::flat_hash_map conv2d_factors; CHECK_EQ(packed_out->shape.size(), 5U) << "packed_out's shape size should be 5"; - Expr w_out = common::AutoSimplify(packed_out->shape[3]); + Expr w_out = cinn::common::AutoSimplify(packed_out->shape[3]); int ow = w_out.as_int32(); int basic_split_factor = GetBasicFactor(type, target); 
GetConv2dFactors(&conv2d_factors, -1, -1, -1, -1, ow, type, target); @@ -1400,8 +1400,8 @@ void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse( auto input_shape = input_pad->shape; int shape_size = input_shape.size(); CHECK_EQ(shape_size, 5U) << "input shape size should be 5"; - Expr oc_bn = common::AutoSimplify(packed_out->shape.back()); - Expr ic_bn = common::AutoSimplify(input_shape.back()); + Expr oc_bn = cinn::common::AutoSimplify(packed_out->shape.back()); + Expr ic_bn = cinn::common::AutoSimplify(input_shape.back()); int oc_bn_size = oc_bn.as_int32(); int ic_bn_size = ic_bn.as_int32(); VLOG(3) << "ow_bn_size " << ow_bn_size; @@ -1482,7 +1482,7 @@ void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse( void CudaScheduleMul(poly::StageMap stages, ir::Tensor output, const std::vector &output_shape, - const common::Target &target) { + const cinn::common::Target &target) { stages[output]->Split(1, 2); stages[output]->Bind(0, "blockIdx.x"); stages[output]->Bind(1, "threadIdx.x"); @@ -2301,7 +2301,7 @@ void SaveSerialData( void CudaScheduleDepthwiseConv(poly::StageMap stages, ir::Tensor &output, // NOLINT - const common::Target &target) { + const cinn::common::Target &target) { auto OL = stages[output]->CacheWrite("local", stages, output); stages[output]->Bind(0, "blockIdx.x"); stages[output]->Bind(1, "blockIdx.y"); @@ -2316,7 +2316,7 @@ void CudaScheduleConv(poly::StageMap stages, ir::Tensor &input_pad, // NOLINT ir::Tensor &weights, // NOLINT ir::Tensor &output, // NOLINT - const common::Target &target) { + const cinn::common::Target &target) { auto &res = ScheduleParam::get_cuda_instance().GetParam(); int n = output->shape[0].as_int32(); int c = output->shape[1].as_int32(); @@ -2385,7 +2385,7 @@ void CudaScheduleConv2(poly::StageMap stages, ir::Tensor &input_pad, // NOLINT ir::Tensor &weights, // NOLINT ir::Tensor &output, // NOLINT - const common::Target &target, + const cinn::common::Target &target, const std::string &key) { auto &res = ScheduleParam::get_cuda_instance().GetParam(); stages[input_pad]->ComputeInline(); @@ -2517,7 +2517,7 @@ void CudaScheduleConv2(poly::StageMap stages, void CudaScheduleWinogradConv(poly::StageMap wino_stages, std::vector &all_tensors, // NOLINT - const common::Target &target) { + const cinn::common::Target &target) { auto &res = ScheduleParam::get_cuda_instance().GetParam(); auto &wino_weights_dilation = all_tensors[0]; auto &wino_input_pad = all_tensors[1]; @@ -2673,7 +2673,7 @@ int MaxFactorLessThan(int a, int b) { void CudaScheduleInjectiveWithVectorize(poly::Stage *stage, const std::vector &output_shape, - const common::Target &target) { + const cinn::common::Target &target) { int dims = stage->n_out_dims(); int prod_size = std::accumulate( output_shape.begin(), output_shape.end(), 1, std::multiplies()); @@ -2745,7 +2745,7 @@ void CudaScheduleInjectiveWithVectorize(poly::Stage *stage, void CudaScheduleInjective(poly::Stage *stage, const std::vector &output_shape, - const common::Target &target) { + const cinn::common::Target &target) { CHECK_EQ(stage->n_out_dims(), stage->n_in_dims()) << "The dims of op are not equal"; if (FLAGS_cinn_use_cuda_vectorize) { @@ -2777,10 +2777,10 @@ void CudaScheduleInjective(poly::Stage *stage, stage->Bind(1, "threadIdx.x"); } -void CudaSplitSchedule(common::CINNValuePack *arg_pack, +void CudaSplitSchedule(cinn::common::CINNValuePack *arg_pack, const std::vector> &output_shapes, int axis, - const common::Target &target) { + const cinn::common::Target &target) { poly::StageMap stages = arg_pack->back(); std::vector out_tensors; 
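// NOTE: a minimal sketch of why this patch spells out cinn::common everywhere.
// Once DDim etc. live in a new top-level ::common namespace, an unqualified
// `common::` can become ambiguous in translation units that pull cinn into
// scope. The namespace contents below are hypothetical; only the shape of the
// clash is real:
//
//   namespace common { struct DDim {}; }                       // new top-level common
//   namespace cinn { namespace common { struct Target {}; } }  // CINN's own common
//   using namespace cinn;
//   // common::Target t;     // error: `common` is ambiguous (::common vs cinn::common)
//   cinn::common::Target t;  // fully qualified -- the form this patch adopts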
int dims = output_shapes[0].size(); diff --git a/paddle/cinn/hlir/pe/schedule.h b/paddle/cinn/hlir/pe/schedule.h index cd3262cf3ad18e..8e863c50e5b6c2 100644 --- a/paddle/cinn/hlir/pe/schedule.h +++ b/paddle/cinn/hlir/pe/schedule.h @@ -35,11 +35,11 @@ class ScheduleParam { ScheduleParam(const ScheduleParam &) = delete; ScheduleParam &operator=(const ScheduleParam &) = delete; static ScheduleParam &get_cuda_instance() { - static ScheduleParam instance{common::Target::Arch::NVGPU}; + static ScheduleParam instance{cinn::common::Target::Arch::NVGPU}; return instance; } static ScheduleParam &get_x86_instance() { - static ScheduleParam instance{common::Target::Arch::X86}; + static ScheduleParam instance{cinn::common::Target::Arch::X86}; return instance; } absl::flat_hash_map>> param_data; @@ -66,37 +66,37 @@ int GetVectorizeFactor(int shape, int split_factor); int SplitEven(int origin); -int GetBasicFactor(const Type &type, const common::Target &target); +int GetBasicFactor(const Type &type, const cinn::common::Target &target); int GetBetterSplitFactor(int shape, int split_factor); int GetArrayPackingFactor(int shape, const Type &type, - const common::Target &target); + const cinn::common::Target &target); void ScheduleInjectiveCPU(poly::Stage *stage, const std::vector &output_shape, - const common::Target &target, + const cinn::common::Target &target, bool vectorizable = true); // to deprecate void ScheduleInjectiveCPU1(poly::Stage *stage, const std::vector &output_shape, - const common::Target &target, + const cinn::common::Target &target, bool vectorizable = true); void MatmulScheduleCUDA(poly::StageMap stages, const ir::Tensor &output, - const common::Target &target); + const cinn::common::Target &target); void MatmulScheduleCPU(poly::StageMap stage, const ir::Tensor &output, const ir::Tensor &packedB, - const common::Target &target); + const cinn::common::Target &target); void MulScheduleCPU(poly::StageMap stage, const ir::Tensor &output, const ir::Tensor &input_tensor, - const common::Target &target); + const cinn::common::Target &target); void SoftmaxScheduleCPU(poly::StageMap stage, const ir::Tensor &output, @@ -110,7 +110,7 @@ void GetConv2dFactors(absl::flat_hash_map *factors, int oh, int ow, const Type &type, - const common::Target &target, + const cinn::common::Target &target, const std::string &key = "", bool import_params = true); @@ -120,7 +120,7 @@ void GetConv2d1x1Factors(absl::flat_hash_map *factors, int oh, int ow, const Type &type, - const common::Target &target); + const cinn::common::Target &target); void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages, const ir::Tensor &res, @@ -128,18 +128,18 @@ void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, - const common::Target &target, + const cinn::common::Target &target, const std::string &key, bool do_padding); void GlobalPoolScheduleGPU(poly::StageMap stages, const std::vector &output, - const common::Target &target); + const cinn::common::Target &target); void PoolScheduleCPU(poly::StageMap stages, const ir::Tensor &output, - const common::Target &target); + const cinn::common::Target &target); void PoolScheduleGPU(poly::StageMap stages, const ir::Tensor &output, - const common::Target &target); + const cinn::common::Target &target); void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages, const ir::Tensor &res, @@ -147,7 +147,7 @@ void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages, const ir::Tensor &input_pad, const 
ir::Tensor &weights_dilation, const ir::Tensor &data, - const common::Target &target); + const cinn::common::Target &target); void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages, const ir::Tensor &res, @@ -155,7 +155,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, - const common::Target &target, + const cinn::common::Target &target, const std::string &key, bool do_padding); @@ -165,7 +165,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages, const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, - const common::Target &target); + const cinn::common::Target &target); void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse( poly::StageMap stages, @@ -174,78 +174,78 @@ void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse( const ir::Tensor &input_pad, const ir::Tensor &weights_dilation, const ir::Tensor &data, - const common::Target &target, + const cinn::common::Target &target, bool do_padding); void CudaScheduleMul(poly::StageMap stages, ir::Tensor output, const std::vector &output_shape, - const common::Target &target); + const cinn::common::Target &target); // reduce shedules. void CudaReduceSchedule(poly::StageMap stages, ir::Tensor output, int last_dimension_num, - const common::Target &target); + const cinn::common::Target &target); void CudaWarpReduceSchedule(poly::StageMap stages, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target); + const cinn::common::Target &target); void CudaBlockReduceInternalSchedule(poly::StageMap stages, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target); + const cinn::common::Target &target); void CudaBlockReduceSchedule(poly::StageMap stages, ir::Tensor reduce_tmp_out, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target); + const cinn::common::Target &target); void CudaBlockShuffleReduceSchedule(poly::StageMap stages, ir::Tensor reduce_reshape, ir::Tensor reduce_internal, ir::Tensor reduce_out, - const common::Target &target); + const cinn::common::Target &target); void CudaTwoStepReduceSchedule(poly::StageMap stages, ir::Tensor reshape, ir::Tensor internal, ir::Tensor tmp_out, ir::Tensor out, - const common::Target &target); + const cinn::common::Target &target); void CudaScheduleDepthwiseConv(poly::StageMap stages, ir::Tensor &output, // NOLINT - const common::Target &target); + const cinn::common::Target &target); void CudaScheduleConv(poly::StageMap stages, ir::Tensor &input_pad, // NOLINT ir::Tensor &weights, // NOLINT ir::Tensor &output, // NOLINT - const common::Target &target); + const cinn::common::Target &target); void CudaScheduleWinogradConv(poly::StageMap wino_stages, std::vector &all_tensors, // NOLINT - const common::Target &target); + const cinn::common::Target &target); void CudaScheduleConv2(poly::StageMap stages, ir::Tensor &input_pad, // NOLINT ir::Tensor &weights, // NOLINT ir::Tensor &output, // NOLINT - const common::Target &target, + const cinn::common::Target &target, const std::string &key); void CudaScheduleInjective(poly::Stage *stage, const std::vector &output_shape, - const common::Target &target); + const cinn::common::Target &target); -void CudaSplitSchedule(common::CINNValuePack *arg_pack, +void CudaSplitSchedule(cinn::common::CINNValuePack *arg_pack, const std::vector> &output_shapes, int axis, - const common::Target &target); + const cinn::common::Target &target); void CreateCudaSerialData(const std::string &file_name = 
"default_serial.log"); diff --git a/paddle/cinn/hlir/pe/transform.cc b/paddle/cinn/hlir/pe/transform.cc index 5c02b4a8493135..81df619097eeff 100644 --- a/paddle/cinn/hlir/pe/transform.cc +++ b/paddle/cinn/hlir/pe/transform.cc @@ -391,7 +391,7 @@ std::vector Split( out_shape[i], [=](const std::vector& indice) { auto temp = indice; - temp[axis] = common::AutoSimplify(temp[axis] + Expr(start[i])); + temp[axis] = cinn::common::AutoSimplify(temp[axis] + Expr(start[i])); return A(temp); }, names[i]); @@ -410,7 +410,7 @@ ir::Tensor Concat(const ir::Tensor& A, std::vector output_shape = A->shape; Expr pivot = A->shape[axis]; output_shape[axis] = - common::AutoSimplify(output_shape[axis] + B->shape[axis]); + cinn::common::AutoSimplify(output_shape[axis] + B->shape[axis]); auto res = Compute( output_shape, [=](const std::vector& indice) { @@ -438,8 +438,8 @@ ir::Tensor Concat(const std::vector& input_tensors, CHECK_EQ(input_tensors[i]->shape.size(), input_dim) << "Dimensions of inputs tensors in Concat should be equal! Please " "check."; - output_shape[axis] = common::AutoSimplify(output_shape[axis] + - input_tensors[i]->shape[axis]); + output_shape[axis] = cinn::common::AutoSimplify( + output_shape[axis] + input_tensors[i]->shape[axis]); } auto res = Compute( @@ -448,7 +448,7 @@ ir::Tensor Concat(const std::vector& input_tensors, auto ret = input_tensors[0](indice); Expr accumulate_shape = Expr(0); for (int i = 0; i < input_size - 1; i++) { - accumulate_shape = common::AutoSimplify( + accumulate_shape = cinn::common::AutoSimplify( accumulate_shape + input_tensors[i]->shape[axis]); std::vector new_indice = indice; new_indice[axis] = indice[axis] - accumulate_shape; @@ -468,7 +468,7 @@ std::vector MatmulV2(const Tensor& A, bool trans_b, float alpha, const std::string& name, - const common::Target& target) { + const cinn::common::Target& target) { std::vector shape_A = A->shape; std::vector shape_B = B->shape; int a_dim = shape_A.size(); @@ -564,7 +564,7 @@ std::vector MatmulMKL(const Tensor& A, bool trans_b, float alpha, const std::string& name, - const common::Target& target) { + const cinn::common::Target& target) { CHECK(target.arch == Target::Arch::X86) << "mkl should be used in the cpu environment"; std::vector shape_A = A->shape; @@ -597,18 +597,18 @@ std::vector MatmulMKL(const Tensor& A, [=]() -> Expr { return lang::CallExtern("cinn_cpu_mkl_gemm_fp32", { - Expr(alpha), // alpha - M, // M - N, // N - x_width, // K - common::make_bool(trans_a), // ta - common::make_bool(trans_b), // tb - shape_A.back(), // lda - shape_B.back(), // ldb - N, // ldc - common::make_zero(), // beta - A, // A - B, // B + Expr(alpha), // alpha + M, // M + N, // N + x_width, // K + cinn::common::make_bool(trans_a), // ta + cinn::common::make_bool(trans_b), // tb + shape_A.back(), // lda + shape_B.back(), // ldb + N, // ldc + cinn::common::make_zero(), // beta + A, // A + B, // B }); }, UniqName("matmul_mkl_out")); @@ -619,22 +619,22 @@ std::vector MatmulMKL(const Tensor& A, [=]() -> Expr { return lang::CallExtern("cinn_cpu_mkl_gemm_batch_fp32", { - Expr(alpha), // alpha - shape_A.front(), // batch - M, // M - N, // N - x_width, // K - common::make_bool(trans_a), // ta - common::make_bool(trans_b), // tb - shape_A.back(), // lda - shape_B.back(), // ldb - N, // ldc - M * x_width, // a_stride - N * x_width, // b_stride - M * N, // c_stride - common::make_zero(), // beta - A, // A - B, // B + Expr(alpha), // alpha + shape_A.front(), // batch + M, // M + N, // N + x_width, // K + cinn::common::make_bool(trans_a), // ta + 
cinn::common::make_bool(trans_b), // tb + shape_A.back(), // lda + shape_B.back(), // ldb + N, // ldc + M * x_width, // a_stride + N * x_width, // b_stride + M * N, // c_stride + cinn::common::make_zero(), // beta + A, // A + B, // B }); }, UniqName("batch_matmul_mkl_out")); @@ -644,7 +644,9 @@ std::vector MatmulMKL(const Tensor& A, return {out, call}; } -int GetMulFactor(int shape, const Type& type, const common::Target& target) { +int GetMulFactor(int shape, + const Type& type, + const cinn::common::Target& target) { int split_base = GetBasicFactor(type, target); int split_factor = 1; for (size_t i = split_base; i >= 1; --i) { @@ -659,7 +661,7 @@ int GetMulFactor(int shape, const Type& type, const common::Target& target) { std::vector MulBase(const Tensor& A, const Tensor& B, const std::string& name, - const common::Target& target) { + const cinn::common::Target& target) { std::vector output_shape; CHECK_EQ(A->shape.size(), 2U) << "tensor_A's shape size should be two while current shape size is " @@ -748,7 +750,7 @@ std::vector Mul(const Tensor& A, std::vector MulMKL(const Tensor& A, const Tensor& B, const std::string& name, - const common::Target& target) { + const cinn::common::Target& target) { CHECK(target.arch == Target::Arch::X86) << "mkl should be used in the cpu environment"; std::vector shape_A = A->shape; @@ -776,18 +778,18 @@ std::vector MulMKL(const Tensor& A, [=]() -> Expr { return lang::CallExtern("cinn_cpu_mkl_gemm_fp32", { - Expr(1.0f), // alpha - M, // M - N, // N - x_width, // K - common::make_bool(false), // ta - common::make_bool(true), // tb - shape_A.back(), // lda - shape_B.back(), // ldb - N, // ldc - common::make_zero(), // beta - A, // A - B, // B + Expr(1.0f), // alpha + M, // M + N, // N + x_width, // K + cinn::common::make_bool(false), // ta + cinn::common::make_bool(true), // tb + shape_A.back(), // lda + shape_B.back(), // ldb + N, // ldc + cinn::common::make_zero(), // beta + A, // A + B, // B }); }, UniqName("mul_mkl_out")); @@ -847,7 +849,7 @@ std::vector InferShapeLayoutTransform( int dst_prim_index = (*split_index_map)[i][0]; int dst_sub_index = (*split_index_map)[i][1]; int factor = (*split_index_map)[i][2]; - Expr chunk_shape = common::AutoSimplify(input_shapes[i] / factor); + Expr chunk_shape = cinn::common::AutoSimplify(input_shapes[i] / factor); Expr block_shape = Expr(factor); output_shape[dst_prim_index] = chunk_shape; output_shape[dst_sub_index] = block_shape; @@ -867,7 +869,7 @@ std::vector InferShapeLayoutTransform( CHECK_GE(input_shapes.size(), src_sub_index); CHECK_EQ(input_shapes[src_sub_index].as_int32(), factor); output_shape[i] = - common::AutoSimplify(input_shapes[src_prim_index] * factor); + cinn::common::AutoSimplify(input_shapes[src_prim_index] * factor); } else if ((*split_index_map)[i].size() == 1) { int src_prim_index = (*split_index_map)[i][0]; output_shape[i] = input_shapes[src_prim_index]; @@ -915,11 +917,13 @@ ir::Tensor LayoutTransform(const Tensor& input, int sub_index = split_infos[1]; int factor = split_infos[2]; if (dst_dim > src_dim) { - new_indice[i] = common::AutoSimplify(indice[prim_index] * factor + - indice[sub_index]); + new_indice[i] = cinn::common::AutoSimplify( + indice[prim_index] * factor + indice[sub_index]); } else { - new_indice[prim_index] = common::AutoSimplify(indice[i] / factor); - new_indice[sub_index] = common::AutoSimplify(indice[i] % factor); + new_indice[prim_index] = + cinn::common::AutoSimplify(indice[i] / factor); + new_indice[sub_index] = + cinn::common::AutoSimplify(indice[i] % factor); } } 
else if (split_infos.size() == 1) { @@ -1186,7 +1190,7 @@ ir::Tensor Gather(const ir::Tensor& x, // to int32 in CINN. See the below link for more details: // https://github.com/PaddlePaddle/CINN/blob/85ab4981a38926dc5c1dbf672762cec335d2b857/cinn/ir/ir.cc#L477 transformed_indice[axis] = - ir::Cast::Make(common::Int(32), index(indice)); + ir::Cast::Make(cinn::common::Int(32), index(indice)); return x(transformed_indice); }, name); @@ -1196,15 +1200,15 @@ ir::Tensor Gather(const ir::Tensor& x, ir::Tensor ScatterAssign(const ir::Tensor& input, const ir::Tensor& updates, const ir::Tensor& index, - const common::Target& target, + const cinn::common::Target& target, const int axis, const std::string& output_name) { - CHECK_EQ(index->type(), common::Int(32)) + CHECK_EQ(index->type(), cinn::common::Int(32)) << "Param [Index] of ScatterAssign only support int32 ! Please Check.\n"; std::string extern_fun_name; - if (target.arch == common::Target::Arch::NVGPU) { + if (target.arch == cinn::common::Target::Arch::NVGPU) { extern_fun_name.assign("cinn_cuda_find_int"); - } else if (target.arch == common::Target::Arch::X86) { + } else if (target.arch == cinn::common::Target::Arch::X86) { extern_fun_name.assign("cinn_host_find_int"); } else { LOG(FATAL) << "ScatterAssign only support X86 and NVGPU ! Please Check.\n"; @@ -1236,13 +1240,13 @@ ir::Tensor ScatterAssign(const ir::Tensor& input, ir::Tensor ScatterAdd(const ir::Tensor& input, const ir::Tensor& updates, const ir::Tensor& index, - const common::Target& target, + const cinn::common::Target& target, const int axis, const std::string& output_name) { - CHECK_EQ(target.arch, common::Target::Arch::NVGPU) + CHECK_EQ(target.arch, cinn::common::Target::Arch::NVGPU) << "Op IndexAdd only support NVGPU now ! Please Check.\n"; - CHECK_EQ(index->type(), common::Int(32)) + CHECK_EQ(index->type(), cinn::common::Int(32)) << "Param [index] of IndexAdd only support int32 ! Please Check.\n"; CHECK_EQ(index->shape.size(), 1) << "The dimension of param [index] of " "IndexAdd should be 1 ! 
Please Check.\n"; diff --git a/paddle/cinn/hlir/pe/transform.h b/paddle/cinn/hlir/pe/transform.h index e6dffa42e803b2..dfc090b0cad444 100644 --- a/paddle/cinn/hlir/pe/transform.h +++ b/paddle/cinn/hlir/pe/transform.h @@ -83,7 +83,7 @@ std::vector MatmulV2( bool trans_b = false, float alpha = 1, const std::string& name = UniqName("T_Transform_MatmulV2_out"), - const common::Target& target = common::DefaultHostTarget()); + const cinn::common::Target& target = cinn::common::DefaultHostTarget()); std::vector MatmulMKL( const ir::Tensor& A, @@ -92,9 +92,11 @@ std::vector MatmulMKL( bool trans_b = false, float alpha = 1, const std::string& name = UniqName("T_Transform_MatmulMKL_out"), - const common::Target& target = common::DefaultHostTarget()); + const cinn::common::Target& target = cinn::common::DefaultHostTarget()); -int GetMulFactor(int shape, const Type& type, const common::Target& target); +int GetMulFactor(int shape, + const Type& type, + const cinn::common::Target& target); /** * @brief basic PE that calculates a matrix multiplication @@ -112,7 +114,7 @@ std::vector MulBase( const ir::Tensor& A, const ir::Tensor& B, const std::string& name = UniqName("T_Transform_MulBase_out"), - const common::Target& target = common::DefaultHostTarget()); + const cinn::common::Target& target = cinn::common::DefaultHostTarget()); std::vector Mul(const ir::Tensor& A, const ir::Tensor& B, @@ -125,7 +127,7 @@ std::vector MulMKL( const ir::Tensor& A, const ir::Tensor& B, const std::string& name = UniqName("T_Transform_MulMKL_out"), - const common::Target& target = common::DefaultHostTarget()); + const cinn::common::Target& target = cinn::common::DefaultHostTarget()); ir::Tensor LayoutTransform( const ir::Tensor& input, @@ -223,7 +225,7 @@ ir::Tensor ScatterAssign( const ir::Tensor& input, const ir::Tensor& updates, const ir::Tensor& index, - const common::Target& target, + const cinn::common::Target& target, const int axis = 0, const std::string& output_name = UniqName("T_Transform_ScatterAssign_out")); @@ -237,7 +239,7 @@ ir::Tensor ScatterAssign( ir::Tensor ScatterAdd(const ir::Tensor& input, const ir::Tensor& updates, const ir::Tensor& index, - const common::Target& target, + const cinn::common::Target& target, const int axis, const std::string& output_name); diff --git a/paddle/cinn/ir/buffer.cc b/paddle/cinn/ir/buffer.cc index ee816d2b0bd716..ada0d4487b7f02 100644 --- a/paddle/cinn/ir/buffer.cc +++ b/paddle/cinn/ir/buffer.cc @@ -55,7 +55,7 @@ Buffer _Buffer_::Make(Var data, CHECK(dtype.valid()); CHECK(!dtype.is_unk()); CHECK(!dtype.is_void()); - auto *node = common::make_shared<_Buffer_>(); + auto *node = cinn::common::make_shared<_Buffer_>(); node->shape = shape; node->strides = strides; node->elem_offset = elem_offset; @@ -69,7 +69,7 @@ Buffer _Buffer_::Make(Var data, } Buffer _Buffer_::Make(const std::string &name, const std::vector &shape) { - auto *node = common::make_shared<_Buffer_>(); + auto *node = cinn::common::make_shared<_Buffer_>(); node->name = name; node->shape = shape; node->dtype = Void(); @@ -77,7 +77,7 @@ Buffer _Buffer_::Make(const std::string &name, const std::vector &shape) { } Buffer _Buffer_::Make() { - auto *node = common::make_shared<_Buffer_>(); + auto *node = cinn::common::make_shared<_Buffer_>(); node->dtype = Void(); return Buffer(node); } diff --git a/paddle/cinn/ir/dim.cc b/paddle/cinn/ir/dim.cc index fe5b288850d4e2..0b488e641161cd 100644 --- a/paddle/cinn/ir/dim.cc +++ b/paddle/cinn/ir/dim.cc @@ -38,7 +38,7 @@ Dim _Dim_::Make(const std::string& name, const 
SymbolicDimOp& sym_dim) { n->name = name; n->sym_dim = sym_dim; if (sym_dim.IsDynamic()) { - n->dim_expr = Expr(Var(sym_dim.GetSymName(), common::Int(32))); + n->dim_expr = Expr(Var(sym_dim.GetSymName(), cinn::common::Int(32))); } else { n->dim_expr = Expr(static_cast(sym_dim.GetDimSize())); } diff --git a/paddle/cinn/ir/group_schedule/base_group_scheduler.cc b/paddle/cinn/ir/group_schedule/base_group_scheduler.cc index ab215ee952b8f4..687122741aa2e1 100644 --- a/paddle/cinn/ir/group_schedule/base_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/base_group_scheduler.cc @@ -22,7 +22,7 @@ namespace ir { std::unique_ptr GroupScheduler::Make( ir::IRSchedule* ir_sch, const std::unordered_set& output_tensor_names, - const common::Target& target, + const cinn::common::Target& target, bool is_dy_shape) { if (is_dy_shape) { return std::make_unique( diff --git a/paddle/cinn/ir/group_schedule/base_group_scheduler.h b/paddle/cinn/ir/group_schedule/base_group_scheduler.h index f941d13e30f149..6a277f01d43bf4 100644 --- a/paddle/cinn/ir/group_schedule/base_group_scheduler.h +++ b/paddle/cinn/ir/group_schedule/base_group_scheduler.h @@ -29,7 +29,7 @@ class GroupScheduler { public: GroupScheduler(ir::IRSchedule* ir_sch, const std::unordered_set& output_tensor_names, - const common::Target& target) + const cinn::common::Target& target) : ir_sch_(ir_sch), output_tensor_names_(output_tensor_names), target_(target) { @@ -39,7 +39,7 @@ class GroupScheduler { static std::unique_ptr Make( ir::IRSchedule* ir_sch, const std::unordered_set& output_tensor_names, - const common::Target& target, + const cinn::common::Target& target, bool is_dy_shape = false); virtual ~GroupScheduler() = default; @@ -51,7 +51,7 @@ class GroupScheduler { protected: ir::IRSchedule* ir_sch_; const std::unordered_set& output_tensor_names_; - const common::Target& target_; + const cinn::common::Target& target_; // Graph in units of ScheduleBlockNode, each node corresponds to a // ScheduleBlock in IR. 
std::unique_ptr schedule_block_graph_; diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h index 1026ee095425df..6b89a0eff00033 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.h @@ -27,7 +27,7 @@ class DynamicShapeGroupScheduler : public GroupScheduler { DynamicShapeGroupScheduler( ir::IRSchedule* ir_sch, const std::unordered_set& output_tensor_names, - const common::Target& target) + const cinn::common::Target& target) : GroupScheduler(ir_sch, output_tensor_names, target) {} void Schedule() override; diff --git a/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc index 92c674ccd9e132..bced79128ca497 100644 --- a/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc @@ -672,7 +672,7 @@ void StaticShapeGroupScheduler::AllocateStorage() { int extent = 1; for (int idx = tensor->shape.size() - 1; idx >= 0; --idx) { strides.insert(strides.begin(), extent); - tensor->shape[idx] = common::AutoSimplify(tensor->shape[idx]); + tensor->shape[idx] = cinn::common::AutoSimplify(tensor->shape[idx]); CHECK(tensor->shape[idx].is_constant()) << "Shape of tensor: " << tensor << " is not constant"; extent *= tensor->shape[idx].get_constant(); @@ -681,12 +681,12 @@ void StaticShapeGroupScheduler::AllocateStorage() { for (int idx = 0; idx < indices.size(); ++idx) { flatten_indice = flatten_indice + ir::Expr(strides[idx]) * indices[idx]; } - flatten_indice = common::AutoSimplify(flatten_indice); + flatten_indice = cinn::common::AutoSimplify(flatten_indice); for (int idx = 0; idx < iter_vars.size(); ++idx) { optim::ReplaceVarWithExpr( &flatten_indice, iter_vars[idx], iter_values[idx]); } - flatten_indice = common::AutoSimplify(flatten_indice); + flatten_indice = cinn::common::AutoSimplify(flatten_indice); VLOG(6) << "flatten_indice of " << load_or_store << " : " << flatten_indice; return flatten_indice; }; @@ -781,12 +781,12 @@ void StaticShapeGroupScheduler::AllocateStorage() { } VLOG(6) << "lower_bound before simplify of " << indice_value << " = " << copy_for_lower_bound; - copy_for_lower_bound = - common::AutoSimplify(common::AutoSimplify(copy_for_lower_bound)); + copy_for_lower_bound = cinn::common::AutoSimplify( + cinn::common::AutoSimplify(copy_for_lower_bound)); VLOG(6) << "upper_bound before simplify of " << indice_value << " = " << copy_for_upper_bound; - copy_for_upper_bound = - common::AutoSimplify(common::AutoSimplify(copy_for_upper_bound)); + copy_for_upper_bound = cinn::common::AutoSimplify( + cinn::common::AutoSimplify(copy_for_upper_bound)); VLOG(6) << "lower_bound of " << indice_value << " = " << copy_for_lower_bound; VLOG(6) << "upper_bound of " << indice_value << " = " @@ -839,7 +839,7 @@ void StaticShapeGroupScheduler::AllocateStorage() { << indice_value << " = " << indice_copies[i] << ", range = (" << coef_and_ranges[i].second.min << ", " << coef_and_ranges[i].second.max << ")"; - indice_copies[i] = common::AutoSimplify(indice_copies[i]); + indice_copies[i] = cinn::common::AutoSimplify(indice_copies[i]); VLOG(6) << "after simplify [" << i << "], the coefficient of " << indice_value << " = " << indice_copies << ", range = (" << coef_and_ranges[i].second.min << ", " diff --git a/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.h b/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.h index 
0187d171b06e7c..be27d59b18f0bb 100644 --- a/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.h +++ b/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.h @@ -46,7 +46,7 @@ class StaticShapeGroupScheduler : public GroupScheduler { StaticShapeGroupScheduler( ir::IRSchedule* ir_sch, const std::unordered_set& output_tensor_names, - const common::Target& target) + const cinn::common::Target& target) : GroupScheduler(ir_sch, output_tensor_names, target) {} void Schedule() override; diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index 4b4979bc86169f..b556dad00cb324 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -29,7 +29,7 @@ namespace cinn { namespace ir { -using common::make_shared; +using cinn::common::make_shared; Expr Cast::Make(Type t, Expr v) { CHECK(!t.is_unk()); @@ -389,7 +389,7 @@ Expr Store::index() const { if (indices.size() == 1) { return indices[0]; } - Expr res = common::IndiceToAbsOffset(tensor_n->shape, indices); + Expr res = cinn::common::IndiceToAbsOffset(tensor_n->shape, indices); optim::Simplify(&res); return res; } @@ -477,7 +477,7 @@ Expr Call::Make(Type type, CHECK(read_args[i].defined()); } - auto node = common::make_shared(type); + auto node = cinn::common::make_shared(type); node->name = name; node->read_args = read_args; node->write_args = write_args; @@ -623,7 +623,7 @@ Expr Load::index() const { if (indices.size() == 1) { return indices[0]; } - Expr res = common::IndiceToAbsOffset(tensor_n->shape, indices); + Expr res = cinn::common::IndiceToAbsOffset(tensor_n->shape, indices); VLOG(3) << "Begin Load::index Simplify"; optim::Simplify(&res); return res; @@ -747,7 +747,7 @@ Expr Reduce::Make(Reduce::ReduceType reduce_type, const std::vector &reduce_aixs) { CHECK(body.defined()); CHECK(init.defined()); - auto n = common::make_shared(); + auto n = cinn::common::make_shared(); n->init = init; n->body = body; n->reduce_type = reduce_type; diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index 9a40d3fb32f0c7..4b510a3b156fb7 100644 --- a/paddle/cinn/ir/ir.h +++ b/paddle/cinn/ir/ir.h @@ -44,8 +44,8 @@ class BufferRange; struct LoweredFunc; class Module; -using common::Object; -using common::Shared; +using cinn::common::Object; +using cinn::common::Shared; // NOTE attr_t only support POD, can not contain Expr or other IR nodes, or the // IRVisitor or IRCopy on PrimitiveNode will result in undefined behavior. 
using attr_t = absl::variant; diff --git a/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc b/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc index 29c503255c1876..724cca3e6279ce 100644 --- a/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc +++ b/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc @@ -173,7 +173,7 @@ Expr AddUnitLoop(const std::vector& exprs, const Expr& block) { ->schedule_block.As() ->name == block_name) { auto block = ir::Block::Make({GetBlock(exprs, block_name)}); - auto loop = ir::For::Make(ir::Var(common::UniqName("ix")), + auto loop = ir::For::Make(ir::Var(cinn::common::UniqName("ix")), ir::Expr(0), ir::Expr(1), ir::ForType::Serial, @@ -186,7 +186,7 @@ Expr AddUnitLoop(const std::vector& exprs, const Expr& block) { } } else if (visitor.target_->As()) { auto block = ir::Block::Make({visitor.target_->As()->body}); - auto loop = ir::For::Make(ir::Var(common::UniqName("ix")), + auto loop = ir::For::Make(ir::Var(cinn::common::UniqName("ix")), ir::Expr(0), ir::Expr(1), ir::ForType::Serial, @@ -197,7 +197,7 @@ Expr AddUnitLoop(const std::vector& exprs, const Expr& block) { } else if (visitor.target_->As()) { auto block = ir::Block::Make({visitor.target_->As()->body}); - auto loop = ir::For::Make(ir::Var(common::UniqName("ix")), + auto loop = ir::For::Make(ir::Var(cinn::common::UniqName("ix")), ir::Expr(0), ir::Expr(1), ir::ForType::Serial, diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h index b6a94259fbb85f..c333448d029ae0 100644 --- a/paddle/cinn/ir/ir_base.h +++ b/paddle/cinn/ir/ir_base.h @@ -29,12 +29,12 @@ namespace cinn { namespace ir { -using common::BFloat16; -using common::Float; -using common::Float16; -using common::Int; -using common::Type; -using common::type_of; +using cinn::common::BFloat16; +using cinn::common::Float; +using cinn::common::Float16; +using cinn::common::Int; +using cinn::common::Type; +using cinn::common::type_of; class Module; class IRVisitor; @@ -144,7 +144,7 @@ struct Expr; /** * The base of all the nodes in the IR. */ -class IrNode : public common::Object { +class IrNode : public cinn::common::Object { public: //! The operands of this operator. std::vector operands; @@ -177,7 +177,7 @@ class IrNode : public common::Object { /** * A handle to store any IRNode. 
*/ -class IrNodeRef : public common::Shared { +class IrNodeRef : public cinn::common::Shared { public: IrNodeRef() = default; IrNodeRef(const IrNodeRef& other) : Shared(other.p_) {} diff --git a/paddle/cinn/ir/ir_printer.cc b/paddle/cinn/ir/ir_printer.cc index 25ac1daca49e70..b8676cb078960a 100644 --- a/paddle/cinn/ir/ir_printer.cc +++ b/paddle/cinn/ir/ir_printer.cc @@ -28,8 +28,8 @@ namespace cinn { namespace ir { -using common::bfloat16; -using common::float16; +using cinn::common::bfloat16; +using cinn::common::float16; void IrPrinter::Print(const Expr &e) { IRVisitorRequireReImpl::Visit(&e); diff --git a/paddle/cinn/ir/lowered_func.cc b/paddle/cinn/ir/lowered_func.cc index 13f0fe9eaabc9b..129fc5d6e32782 100644 --- a/paddle/cinn/ir/lowered_func.cc +++ b/paddle/cinn/ir/lowered_func.cc @@ -33,8 +33,8 @@ namespace cinn { namespace ir { -using common::bfloat16; -using common::float16; +using cinn::common::bfloat16; +using cinn::common::float16; const _LoweredFunc_* LoweredFunc::operator->() const { return As<_LoweredFunc_>(); @@ -171,7 +171,7 @@ std::vector _LoweredFunc_::PrepareCreateTempBufferExprs() const { auto expr = ir::intrinsics::BufferCreate::Make(temp_buf); auto buffer_ptr_type = Type() - .set_customized_type(common::customized_type::kbuffer_t) + .set_customized_type(cinn::common::customized_type::kbuffer_t) .set_cpp_handle(); Var variable = ir::_Var_::Make(temp_buf->name, buffer_ptr_type); expr = ir::Let::Make(variable, expr); @@ -301,7 +301,7 @@ void _LoweredFunc_::PrepareArgumentExprs() { // type of `cinn_buffer_t*` auto buffer_ptr_type = Type() - .set_customized_type(common::customized_type::kbuffer_t) + .set_customized_type(cinn::common::customized_type::kbuffer_t) .set_cpp_handle(); // type of `const cinn_buffer_t*` auto const_buffer_ptr_type = buffer_ptr_type.with_cpp_const(); @@ -309,13 +309,13 @@ void _LoweredFunc_::PrepareArgumentExprs() { Var args_passed_in("_args", type_of()); auto pod_value_ptr = - common::CastIfNeeded(args_passed_in, type_of()); + cinn::common::CastIfNeeded(args_passed_in, type_of()); if (FLAGS_cinn_runtime_display_debug_info) { argument_prepare_exprs.push_back(runtime::IntrinsicCall( Void(), runtime::intrinsic::print_debug_args_repr, - {pod_value_ptr, common::make_const(Int(32), args.size())})); + {pod_value_ptr, cinn::common::make_const(Int(32), args.size())})); } /* @@ -333,7 +333,7 @@ void _LoweredFunc_::PrepareArgumentExprs() { // cast arg to cinn_pod_value_t* // something like `_args[0]` - Expr load_expr = Load::Make(pod_value_ptr, {common::make_const(i)}); + Expr load_expr = Load::Make(pod_value_ptr, {cinn::common::make_const(i)}); CHECK_EQ(load_expr.type(), type_of()); load_expr = ir::intrinsics::GetAddr::Make(load_expr); diff --git a/paddle/cinn/ir/module.h b/paddle/cinn/ir/module.h index 6d122a2b8d764d..fad8377e6b0158 100644 --- a/paddle/cinn/ir/module.h +++ b/paddle/cinn/ir/module.h @@ -36,7 +36,7 @@ class Module : public ir::IrNodeRef { public: struct Builder { Builder(const std::string& name, const Target& target) - : module_(common::make_shared()) { + : module_(cinn::common::make_shared()) { module_->name = name; module_->target = target; } diff --git a/paddle/cinn/ir/op/ir_operators.cc b/paddle/cinn/ir/op/ir_operators.cc index 69bdea2378a747..fcb0e19a6bb95a 100644 --- a/paddle/cinn/ir/op/ir_operators.cc +++ b/paddle/cinn/ir/op/ir_operators.cc @@ -82,9 +82,9 @@ Expr operator|(Expr a, Expr b) { } } auto target = cinn::runtime::CurrentTarget::GetCurrentTarget(); - if (target.arch == common::Target::Arch::X86) { + if (target.arch == 
cinn::common::Target::Arch::X86) { return lang::CallExtern("bitwise_or", {a, b}, {{"vectorizable", false}}); - } else if (target.arch == common::Target::Arch::NVGPU) { + } else if (target.arch == cinn::common::Target::Arch::NVGPU) { auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_or"); return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}}); } else { @@ -105,9 +105,9 @@ Expr operator&(Expr a, Expr b) { } } auto target = cinn::runtime::CurrentTarget::GetCurrentTarget(); - if (target.arch == common::Target::Arch::X86) { + if (target.arch == cinn::common::Target::Arch::X86) { return lang::CallExtern("bitwise_and", {a, b}, {{"vectorizable", false}}); - } else if (target.arch == common::Target::Arch::NVGPU) { + } else if (target.arch == cinn::common::Target::Arch::NVGPU) { auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_and"); return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}}); } else { @@ -129,9 +129,9 @@ Expr operator^(Expr a, Expr b) { } } auto target = cinn::runtime::CurrentTarget::GetCurrentTarget(); - if (target.arch == common::Target::Arch::X86) { + if (target.arch == cinn::common::Target::Arch::X86) { return lang::CallExtern("bitwise_xor", {a, b}, {{"vectorizable", false}}); - } else if (target.arch == common::Target::Arch::NVGPU) { + } else if (target.arch == cinn::common::Target::Arch::NVGPU) { auto func_name = hlir::GetExternFuncName(target, t_a, "bitwise_xor"); return lang::CallExtern(func_name, {a, b}, {{"vectorizable", false}}); } else { @@ -143,9 +143,9 @@ Expr operator^(Expr a, Expr b) { Expr operator~(Expr a) { CHECK(a.type().is_int() || a.type().is_uint()); auto target = cinn::runtime::CurrentTarget::GetCurrentTarget(); - if (target.arch == common::Target::Arch::X86) { + if (target.arch == cinn::common::Target::Arch::X86) { return lang::CallExtern("bitwise_not", {a}, {{"vectorizable", false}}); - } else if (target.arch == common::Target::Arch::NVGPU) { + } else if (target.arch == cinn::common::Target::Arch::NVGPU) { auto func_name = hlir::GetExternFuncName(target, a->type(), "bitwise_not"); return lang::CallExtern(func_name, {a}, {{"vectorizable", false}}); } else { diff --git a/paddle/cinn/ir/operation.cc b/paddle/cinn/ir/operation.cc index 6a6b6a3107c7d5..f6ceb45964ba81 100644 --- a/paddle/cinn/ir/operation.cc +++ b/paddle/cinn/ir/operation.cc @@ -62,7 +62,7 @@ Operation ComputeOp::Make(const std::string &name, n->reduce_axis = reduce_axis; n->tag = tag; n->attrs = attrs; - n->axis = common::GenDefaultAxis(domain.size()); + n->axis = cinn::common::GenDefaultAxis(domain.size()); std::vector tmp_axis; for (auto &x : n->axis) { tmp_axis.push_back(x); diff --git a/paddle/cinn/ir/schedule/factorize_reduction.h b/paddle/cinn/ir/schedule/factorize_reduction.h index 4075feb93599e0..82ad269e0750ae 100644 --- a/paddle/cinn/ir/schedule/factorize_reduction.h +++ b/paddle/cinn/ir/schedule/factorize_reduction.h @@ -33,7 +33,7 @@ namespace ir { Tensor CreateRFTensor(const Tensor& original_tensor, const Expr& rf_loop, int rf_axis) { - std::string name = common::UniqName(original_tensor->name + "_rf"); + std::string name = cinn::common::UniqName(original_tensor->name + "_rf"); std::vector new_shape = original_tensor->shape; new_shape.insert(new_shape.begin() + rf_axis, rf_loop.As()->extent); Tensor rf_tensor = _Tensor_::Make(name, diff --git a/paddle/cinn/ir/schedule/impl/base.cc b/paddle/cinn/ir/schedule/impl/base.cc index f8cdd8f7279148..63f7d252ab2ab1 100644 --- a/paddle/cinn/ir/schedule/impl/base.cc +++ 
b/paddle/cinn/ir/schedule/impl/base.cc @@ -363,7 +363,7 @@ std::vector StScheduleImpl::SamplePerfectTile( CHECK_GE(n, 2) << "The number of tile factors should be at least 2"; CHECK_GE(max_innermost_factor, 1) << "The max innermost factor should be at least 1"; - CHECK(common::is_zero(loop.As()->min)) + CHECK(cinn::common::is_zero(loop.As()->min)) << "The For loop should start from 0"; int loop_extent = GetLoopExtent(loop); std::vector innermost_factors; diff --git a/paddle/cinn/ir/schedule/impl/for_type.cc b/paddle/cinn/ir/schedule/impl/for_type.cc index 0e5bf07d9fb880..63ad5b888d0ea6 100644 --- a/paddle/cinn/ir/schedule/impl/for_type.cc +++ b/paddle/cinn/ir/schedule/impl/for_type.cc @@ -90,7 +90,8 @@ void StScheduleImpl::Bind(const Expr& loop, const std::string& thread_axis) { << "thread_axis " << thread_axis << " is not supported"; int offset = thread_axis.back() - 'x'; auto cur_dev_info = - common::DevInfoMgr::GetDevInfo(0); + cinn::common::DevInfoMgr::GetDevInfo( + 0); const std::array kMaxBlockDims = cur_dev_info->GetMaxBlockDims(); const std::array kMaxGridDims = cur_dev_info->GetMaxGridDims(); auto check_offset = [&](const char& c) -> bool { diff --git a/paddle/cinn/ir/schedule/impl/loop_transformation.cc b/paddle/cinn/ir/schedule/impl/loop_transformation.cc index c628382c1f7009..f49ffb46b73acc 100644 --- a/paddle/cinn/ir/schedule/impl/loop_transformation.cc +++ b/paddle/cinn/ir/schedule/impl/loop_transformation.cc @@ -83,7 +83,7 @@ std::vector StScheduleImpl::Split(const Expr& loop, CHECK(loop.As()) << "Expr param of Split must be For node! Please check."; auto* for_node = loop.As(); - CHECK(common::is_zero(for_node->min)) + CHECK(cinn::common::is_zero(for_node->min)) << "The For node must start with 0! Please check."; CHECK(for_node->extent.is_constant()) << "The For node's extent must be constant! 
Please check."; @@ -105,12 +105,12 @@ std::vector StScheduleImpl::Split(const Expr& loop, std::vector new_loop_vars; Expr substitute_value(0); for (int i = 0; i < processed_factors.size(); ++i) { - Var temp_var(common::UniqName(for_node->loop_var->name)); + Var temp_var(cinn::common::UniqName(for_node->loop_var->name)); substitute_value = Expr(temp_var) + substitute_value * Expr(processed_factors[i]); new_loop_vars.push_back(temp_var); } - substitute_value = common::AutoSimplify(substitute_value); + substitute_value = cinn::common::AutoSimplify(substitute_value); Expr new_node = ir::ir_utils::IRCopy(for_node->body); ReplaceExpr(&new_node, {for_node->loop_var}, {substitute_value}); std::vector splited_loops; @@ -181,7 +181,7 @@ Expr StScheduleImpl::Fuse(const std::vector& loops) { for (int i = 0; i < loops_number; ++i) { fused_extent = fused_extent * for_nodes[i]->extent; } - fused_extent = common::AutoSimplify(fused_extent); + fused_extent = cinn::common::AutoSimplify(fused_extent); if (!fused_body.As()) fused_body = Block::Make({fused_body}); Expr new_stmt = For::Make(fused_var, diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index db378eba741945..ac9f609e0c20fc 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -72,7 +72,7 @@ Tensor GetReadTensor(const Expr& block, int index) { int GetLoopExtent(const Expr& loop) { CHECK(loop.As()); - CHECK(common::is_zero(loop.As()->min)); + CHECK(cinn::common::is_zero(loop.As()->min)); CHECK(loop.As()->extent.is_constant()); return static_cast(loop.As()->extent.get_constant()); } @@ -92,7 +92,7 @@ void SetCudaAxisInfo(Expr* lowered_func) { auto bind_info = x->As()->bind_info(); info.set_valid(true); if (bind_info.for_type == ForType::GPUThread) { - CHECK(common::is_zero(x->As()->min)); + CHECK(cinn::common::is_zero(x->As()->min)); CHECK(x->As()->extent.is_constant()); int range = x->As()->extent.get_constant(); range = range > info.block_dim(bind_info.offset) @@ -102,7 +102,7 @@ void SetCudaAxisInfo(Expr* lowered_func) { << range; info.set_block_dim(bind_info.offset, range); } else if (bind_info.for_type == ForType::GPUBlock) { - CHECK(common::is_zero(x->As()->min)); + CHECK(cinn::common::is_zero(x->As()->min)); CHECK(x->As()->extent.is_constant()); int range = x->As()->extent.get_constant(); range = range > info.grid_dim(bind_info.offset) @@ -362,8 +362,8 @@ IterRange GetAccessedRange(const Expr& index, ReplaceExpr(&indice_min, iter_vars, var_mins); ReplaceExpr(&indice_max, iter_vars, var_maxs); // simplify expression - indice_min = common::AutoSimplify(indice_min); - indice_max = common::AutoSimplify(indice_max); + indice_min = cinn::common::AutoSimplify(indice_min); + indice_max = cinn::common::AutoSimplify(indice_max); Expr indice_extent; Expr mod_extent(0); @@ -371,7 +371,7 @@ IterRange GetAccessedRange(const Expr& index, Expr mod_right_min = indice_min.As()->a(); Expr mod_right_max = indice_max.As()->a(); Expr mod_right_extent = - common::AutoSimplify(mod_right_max - mod_right_min + 1); + cinn::common::AutoSimplify(mod_right_max - mod_right_min + 1); mod_extent = indice_min.As()->b(); if (mod_right_extent.get_constant() < mod_extent.get_constant()) { mod_extent = mod_right_extent; @@ -379,15 +379,16 @@ IterRange GetAccessedRange(const Expr& index, } if (indice_min == indice_max) { - if (common::is_zero(mod_extent)) { + if (cinn::common::is_zero(mod_extent)) { // If a index keeps constant, its extent should be 1. 
indice_extent = Expr(1); } else { indice_extent = mod_extent; } } else { - indice_extent = common::AutoSimplify(common::AutoSimplify(indice_max) - - common::AutoSimplify(indice_min) + 1); + indice_extent = + cinn::common::AutoSimplify(cinn::common::AutoSimplify(indice_max) - + cinn::common::AutoSimplify(indice_min) + 1); } if (indice_extent.is_constant() && indice_extent.get_constant() < 0) { @@ -500,10 +501,10 @@ Expr MakeCacheBlock(const std::vector& buffer_ranges, // Create loop vars and block vars' binding_value for (const auto& range : buffer_ranges) { Var loop_var( - common::UniqName("cache_ax" + std::to_string(loop_vars.size()))); + cinn::common::UniqName("cache_ax" + std::to_string(loop_vars.size()))); // Var loop_var("ax" + std::to_string(loop_vars.size())); loop_vars.push_back(loop_var); - iter_values.push_back(common::AutoSimplify(range.min + loop_var)); + iter_values.push_back(cinn::common::AutoSimplify(range.min + loop_var)); } // block variables std::vector block_vars; @@ -516,7 +517,7 @@ Expr MakeCacheBlock(const std::vector& buffer_ranges, } auto body = new_tensor->tensor_store_expanded_body(); std::vector axis_vars = - common::GenDefaultAxis(new_tensor->domain.size()); + cinn::common::GenDefaultAxis(new_tensor->domain.size()); axis_vars.insert(axis_vars.end(), new_tensor->reduce_axis.begin(), new_tensor->reduce_axis.end()); @@ -531,7 +532,7 @@ Expr MakeCacheBlock(const std::vector& buffer_ranges, for (int i = static_cast(loop_vars.size()) - 1; i >= 0; i--) { new_body = For::Make(loop_vars[i], Expr(0), - common::AutoSimplify(buffer_ranges[i].extent), + cinn::common::AutoSimplify(buffer_ranges[i].extent), ir::ForType::Serial, device_api, ir::Block::Make({new_body})); @@ -1031,9 +1032,9 @@ void InsertBlock(Expr& for_loop, const Expr& insertion, int index) { // NOLINT } IterRange RangeUnion(const IterRange& range1, const IterRange& range2) { - Expr new_min = common::AutoSimplify(Min::Make(range1.min, range2.min)); - Expr new_extent = common::AutoSimplify( - common::AutoSimplify( + Expr new_min = cinn::common::AutoSimplify(Min::Make(range1.min, range2.min)); + Expr new_extent = cinn::common::AutoSimplify( + cinn::common::AutoSimplify( Max::Make(range1.min + range1.extent, range2.min + range2.extent)) - new_min); return IterRange(new_min, new_extent); diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h index 9d9b416ae6c159..ecb269ca61d991 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.h +++ b/paddle/cinn/ir/schedule/ir_schedule_util.h @@ -476,7 +476,7 @@ struct RfMutator : public ir::IRMutator<> { auto* rf_for = rf_loop_.As(); CHECK(rf_for); CHECK(is_zero(rf_for->min)) << "rfactor loop's min should be zero"; - auto extent = common::AutoSimplify(rf_for->extent); + auto extent = cinn::common::AutoSimplify(rf_for->extent); auto& shape = tensor->shape; auto& domain = tensor->domain; CHECK_LE(rf_axis_, shape.size()) @@ -578,14 +578,14 @@ struct LoopReconstructor : public ir::IRMutator<> { const auto& range = iter_ranges[i]; if (keep_unit_loops || range.extent != Expr(1)) { std::string var_name = - common::UniqName("ax" + std::to_string(loop_vars.size())); + cinn::common::UniqName("ax" + std::to_string(loop_vars.size())); new_var_names.push_back(var_name); Var var(var_name, Int(32)); loop_vars.push_back(var); loop_extents.push_back(range.extent); - iter_values.push_back(common::AutoSimplify(range.min) + var); + iter_values.push_back(cinn::common::AutoSimplify(range.min) + var); } else { - 
iter_values.push_back(common::AutoSimplify(range.min)); + iter_values.push_back(cinn::common::AutoSimplify(range.min)); } } auto schedule_block_node = diff --git a/paddle/cinn/ir/schedule_block_graph.cc b/paddle/cinn/ir/schedule_block_graph.cc index fc8b73104110d2..e879a15776675b 100644 --- a/paddle/cinn/ir/schedule_block_graph.cc +++ b/paddle/cinn/ir/schedule_block_graph.cc @@ -36,15 +36,15 @@ std::vector ScheduleBlockNode::ControlStmts() const { return ir_sch_.GetLoops(id_); } -bool EdgeCompare(const common::Shared& a, - const common::Shared& b) { +bool EdgeCompare(const cinn::common::Shared& a, + const cinn::common::Shared& b) { CHECK_NOTNULL(a.get()); CHECK_NOTNULL(b.get()); return a->index() < b->index(); } -std::vector> +std::vector> ScheduleBlockNode::OrderedInLinks() const { - std::vector> ordered_links; + std::vector> ordered_links; for (auto& in_edge : this->inlinks()) { ordered_links.push_back(in_edge); CHECK_GE(in_edge->index(), 0) @@ -55,9 +55,9 @@ ScheduleBlockNode::OrderedInLinks() const { return ordered_links; } -std::vector> +std::vector> ScheduleBlockNode::OrderedOutLinks() const { - std::vector> ordered_links; + std::vector> ordered_links; for (auto& out_edge : this->outlinks()) { ordered_links.push_back(out_edge); CHECK_GE(out_edge->index(), 0) @@ -132,7 +132,7 @@ void ScheduleBlockGraph::Update(const IRSchedule& ir_sch) { std::vector ScheduleBlockGraph::StartPoints() { std::vector res; - for (common::GraphNode* node : nodes()) { + for (cinn::common::GraphNode* node : nodes()) { if (node->inlinks().empty()) { res.push_back(dynamic_cast(node)); } @@ -142,7 +142,7 @@ std::vector ScheduleBlockGraph::StartPoints() { std::vector ScheduleBlockGraph::EndPoints() { std::vector res; - for (common::GraphNode* node : nodes()) { + for (cinn::common::GraphNode* node : nodes()) { if (node->outlinks().empty()) { res.push_back(dynamic_cast(node)); } @@ -151,7 +151,7 @@ std::vector ScheduleBlockGraph::EndPoints() { } void ScheduleBlockGraph::NodesWalk(const NodeHandlerType& NodeHandler) { - for (common::GraphNode* node : nodes()) { + for (cinn::common::GraphNode* node : nodes()) { ScheduleBlockNode* cur_node = dynamic_cast(node); NodeHandler(cur_node); } @@ -175,8 +175,8 @@ void ScheduleBlockGraph::DFSTopoWalk(const NodeHandlerType& NodeHandler, NextNodeHandler(next_node); } }; - common::DfsTopoWalker walker(VisitPreNodes, - VisitNextNodes); + cinn::common::DfsTopoWalker walker(VisitPreNodes, + VisitNextNodes); std::vector starts = is_reverse ? EndPoints() : StartPoints(); walker(starts.begin(), starts.end(), NodeHandler); diff --git a/paddle/cinn/ir/schedule_block_graph.h b/paddle/cinn/ir/schedule_block_graph.h index 2ccced20457f19..1cad28951926b3 100644 --- a/paddle/cinn/ir/schedule_block_graph.h +++ b/paddle/cinn/ir/schedule_block_graph.h @@ -28,7 +28,7 @@ namespace cinn { namespace ir { // Node in units of ScheduleBlock. -class ScheduleBlockNode : public common::GraphNode { +class ScheduleBlockNode : public cinn::common::GraphNode { public: ScheduleBlockNode(Expr block, const IRSchedule& ir_sch); @@ -66,8 +66,10 @@ class ScheduleBlockNode : public common::GraphNode { } private: - std::vector> OrderedInLinks() const; - std::vector> OrderedOutLinks() const; + std::vector> OrderedInLinks() + const; + std::vector> OrderedOutLinks() + const; private: std::string id_; @@ -78,7 +80,7 @@ class ScheduleBlockNode : public common::GraphNode { // Graph in units of ScheduleBlockNode, each node corresponds to a ScheduleBlock // in IR. 
-class ScheduleBlockGraph : public common::Graph {
+class ScheduleBlockGraph : public cinn::common::Graph {
  public:
   explicit ScheduleBlockGraph(const IRSchedule& ir_sch);
@@ -88,7 +90,8 @@ class ScheduleBlockGraph {
   // Retrieve a node in the graph by id, the id is same as the name of
   // ScheduleBlock.
   ScheduleBlockNode* RetrieveNode(const std::string& id) {
-    return dynamic_cast(common::Graph::RetrieveNode(id));
+    return dynamic_cast(
+        cinn::common::Graph::RetrieveNode(id));
   }
 
   // Get all block name in order,
diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc
index 1576564fcb1232..261db949b997bc 100644
--- a/paddle/cinn/ir/tensor.cc
+++ b/paddle/cinn/ir/tensor.cc
@@ -208,7 +208,7 @@ PlaceholderOp *_Tensor_::get_placeholder_op() const {
 
 void _Tensor_::InitAxis() const {
   // CHECK(!domain_without_reduce_axis().empty());
-  axis_ = common::GenDefaultAxis(domain_without_reduce_axis().size());
+  axis_ = cinn::common::GenDefaultAxis(domain_without_reduce_axis().size());
 }
 
 bool _Tensor_::has_expression() const {
@@ -232,7 +232,7 @@ isl::set _Tensor_::GenerateIslDomain() const {
     } else {
       dims.emplace_back(_axis_with_reduce[i]->name,
                         Expr(0),
-                        Sub::Make(dim, common::make_const(1)));
+                        Sub::Make(dim, cinn::common::make_const(1)));
     }
   }
 }
@@ -410,7 +410,7 @@ Expr _Tensor_::tensor_store_expanded_body() {
   Expr final_body = body();
   if (shape.empty()) return final_body;
 
-  std::vector g_axis = common::GenDefaultAxisAsExpr(shape.size());
+  std::vector g_axis = cinn::common::GenDefaultAxisAsExpr(shape.size());
   if (!new_indices.empty()) {
     g_axis = new_indices;
   }
@@ -472,7 +472,7 @@ void _Tensor_::Bind(const Buffer &buffer) {
 void _Tensor_::WithBuffer(const Type &type) {
   Type buf_type = type.is_void() ? type_ : type;
   lang::Buffer buf(buf_type);
-  buf->target = common::DefaultHostTarget();
+  buf->target = cinn::common::DefaultHostTarget();
   Bind(buf);
 }
 
@@ -494,7 +494,7 @@ void _Tensor_::WithBuffer(const std::string &memory_type,
     }
   } else {
     lang::Buffer buf(buf_type, buffer_name);
-    buf->target = common::DefaultHostTarget();
+    buf->target = cinn::common::DefaultHostTarget();
     Bind(buf);
 
     if (memory_type == "shared") {
@@ -513,8 +513,8 @@ bool _Tensor_::HasSameShapeWith(const Tensor &other) const {
   if (shape.size() != other->shape.size()) return false;
 
   for (int i = 0; i < shape.size(); i++) {
-    Expr dim0 = common::AutoSimplify(shape[i]);
-    Expr dim1 = common::AutoSimplify(other->shape[i]);
+    Expr dim0 = cinn::common::AutoSimplify(shape[i]);
+    Expr dim1 = cinn::common::AutoSimplify(other->shape[i]);
     if (dim0 != dim1) return false;
   }
 
diff --git a/paddle/cinn/ir/tensor.h b/paddle/cinn/ir/tensor.h
index c0cd53ec2349d7..d9047e01cee9c2 100644
--- a/paddle/cinn/ir/tensor.h
+++ b/paddle/cinn/ir/tensor.h
@@ -306,7 +306,7 @@ class _Tensor_ : public ExprNode<_Tensor_> {
                        const Type& type = Void());
   Tensor GetInitTensor(
       poly::StageMap stages,
-      const Target& target = common::DefaultHostTarget()) const;
+      const Target& target = cinn::common::DefaultHostTarget()) const;
 
   /**
    * Create the initialization tensor.
@@ -316,7 +316,7 @@
    */
   ir::Tensor InitReduction(
       poly::StageMap stages,
-      const Target& target = common::DefaultHostTarget()) const;
+      const Target& target = cinn::common::DefaultHostTarget()) const;
 
  private:
   //! Initialize the axis field after the shape field is assigned.
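Note: the test-file hunks that follow keep choosing their Target exactly as
before; only the namespace qualification changes. A self-contained sketch of
that selection idiom, assuming the default-target helpers are declared in
paddle/cinn/common/target.h (both helpers appear throughout this patch); the
helper name PickTestTarget is hypothetical:

#include "paddle/cinn/common/target.h"

// Mirrors the #ifdef CINN_WITH_CUDA idiom used in schedule_block_graph_test.cc
// below: prefer the NVGPU target when CUDA is compiled in, else the host.
cinn::common::Target PickTestTarget() {
#ifdef CINN_WITH_CUDA
  return cinn::common::DefaultNVGPUTarget();
#else
  return cinn::common::DefaultHostTarget();
#endif
}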
diff --git a/paddle/cinn/ir/test/ir_compare_test.cc b/paddle/cinn/ir/test/ir_compare_test.cc index cc9ce438221a2e..bb1c6eb46866cd 100644 --- a/paddle/cinn/ir/test/ir_compare_test.cc +++ b/paddle/cinn/ir/test/ir_compare_test.cc @@ -25,7 +25,7 @@ namespace cinn { namespace ir { namespace ir_utils { TEST(TestIrCompare, SingleFunction) { - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); ir::Expr M(32); ir::Expr N(32); diff --git a/paddle/cinn/ir/test/schedule_block_graph_test.cc b/paddle/cinn/ir/test/schedule_block_graph_test.cc index 78c809dc117d46..3190cec2bc2f10 100644 --- a/paddle/cinn/ir/test/schedule_block_graph_test.cc +++ b/paddle/cinn/ir/test/schedule_block_graph_test.cc @@ -27,17 +27,16 @@ namespace ir { IRSchedule MakeIRSchedule(frontend::Program* program) { #ifdef CINN_WITH_CUDA - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); #else - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); #endif std::unordered_set fetch_ids; auto graph = frontend::Optimize(program, fetch_ids, target); LOG_IF(WARNING, graph->fusion_groups.size() > 1) << "Test Graph has more than 1 group"; - auto& dtype_dict = - graph->GetMutableAttrs>( - "inferdtype"); + auto& dtype_dict = graph->GetMutableAttrs< + absl::flat_hash_map>("inferdtype"); auto& shape_dict = graph->GetMutableAttrs< absl::flat_hash_map>("infershape"); auto op_lowerer = diff --git a/paddle/cinn/ir/test/schedule_desc_test.cc b/paddle/cinn/ir/test/schedule_desc_test.cc index b360f021d6b2c5..dcd8b90ef120dd 100644 --- a/paddle/cinn/ir/test/schedule_desc_test.cc +++ b/paddle/cinn/ir/test/schedule_desc_test.cc @@ -121,7 +121,7 @@ std::string SourceCodeGen(const ModuleExpr& module_expr, class TestScheduleDesc : public ::testing::Test { public: - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); std::vector lowered_funcs; ScheduleDesc trace; void SetUp() override { Context::Global().ResetNameId(); } diff --git a/paddle/cinn/ir/test/st_shape_group_scheduler_test.cc b/paddle/cinn/ir/test/st_shape_group_scheduler_test.cc index 22f64849a8f7a2..37f084c436543f 100644 --- a/paddle/cinn/ir/test/st_shape_group_scheduler_test.cc +++ b/paddle/cinn/ir/test/st_shape_group_scheduler_test.cc @@ -30,7 +30,7 @@ using frontend::RunDecomposer; void Compile(NetBuilder* net_builder) { auto program = net_builder->Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); RunDecomposer(&program, target); auto graph = std::make_shared(program, target); @@ -68,7 +68,7 @@ void CheckAccuracy(NetBuilder* net_builder, const std::vector& input_names) { FLAGS_cinn_new_group_scheduler = true; auto program = net_builder->Build(); - auto target = common::DefaultTarget(); + auto target = cinn::common::DefaultTarget(); auto graph = std::make_shared(program, target); hlir::framework::ApplyPasses(graph.get(), diff --git a/paddle/cinn/ir/test/tensor_test.cc b/paddle/cinn/ir/test/tensor_test.cc index 26e3ce6a851fc0..cea1263f2aba38 100644 --- a/paddle/cinn/ir/test/tensor_test.cc +++ b/paddle/cinn/ir/test/tensor_test.cc @@ -95,10 +95,10 @@ TEST(Tensor, Reshape) { auto func = lang::Lower("fn", stages, {A, B}); - ir::Module::Builder builder("some_modue", common::DefaultHostTarget()); + ir::Module::Builder builder("some_modue", cinn::common::DefaultHostTarget()); builder.AddFunction(func); - backends::CodeGenC codegenc(common::DefaultHostTarget()); + 
backends::CodeGenC codegenc(cinn::common::DefaultHostTarget()); codegenc.SetInlineBuiltinCodes(false); auto source = codegenc.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); LOG(INFO) << "source:\n" << source; @@ -144,10 +144,10 @@ TEST(Tensor, ReshapeCopied) { stages->InsertLazily(B); - ir::Module::Builder builder("some_modue", common::DefaultHostTarget()); + ir::Module::Builder builder("some_modue", cinn::common::DefaultHostTarget()); auto func = lang::Lower("fn", stages, {A, B}, {}, {}, &builder); - backends::CodeGenC codegenc(common::DefaultHostTarget()); + backends::CodeGenC codegenc(cinn::common::DefaultHostTarget()); codegenc.SetInlineBuiltinCodes(false); auto source = codegenc.Compile(builder.Build(), CodeGenC::OutputKind::CImpl); LOG(INFO) << "source:\n" << source; diff --git a/paddle/cinn/ir/utils/ir_copy.cc b/paddle/cinn/ir/utils/ir_copy.cc index a47150d6ab2aaf..08dc2bc1e628cd 100644 --- a/paddle/cinn/ir/utils/ir_copy.cc +++ b/paddle/cinn/ir/utils/ir_copy.cc @@ -52,7 +52,7 @@ struct IRCopyVisitor : public ir::IRVisitorRequireReImpl { return Expr(make_shared(op->type(), op->value)); } Expr Visit(const ir::StringImm* op) override { - return Expr(common::make_shared(op->value)); + return Expr(cinn::common::make_shared(op->value)); } Expr Visit(const ir::Cast* op) override { @@ -367,7 +367,7 @@ struct IRCopyVisitor : public ir::IRVisitorRequireReImpl { arguments.push_back(Visit(args)); } - auto n = common::make_shared(); + auto n = cinn::common::make_shared(); n->name = op->name; n->attrs = op->attrs; // attrs are PODs n->arguments = arguments; diff --git a/paddle/cinn/lang/buffer.cc b/paddle/cinn/lang/buffer.cc index 864adfb165cade..83bdfef63217ec 100644 --- a/paddle/cinn/lang/buffer.cc +++ b/paddle/cinn/lang/buffer.cc @@ -29,7 +29,7 @@ Buffer::Buffer(Type type, const std::string& name) { if (!name.empty()) { buffer_->name = name; } - buffer_->target = common::DefaultHostTarget(); + buffer_->target = cinn::common::DefaultHostTarget(); } } // namespace lang diff --git a/paddle/cinn/lang/builtin.cc b/paddle/cinn/lang/builtin.cc index 3e7ef7390cf7e8..b50a49096847b5 100644 --- a/paddle/cinn/lang/builtin.cc +++ b/paddle/cinn/lang/builtin.cc @@ -127,10 +127,10 @@ Expr FloorDivide(Expr a, Expr b) { } else { auto div = a / b; auto mod = a % b; - auto ret = - ir::Select::Make(ir::EQ::Make(mod, common::make_const(a.type(), 0)), - div, - div - common::make_const(a.type(), 1)); + auto ret = ir::Select::Make( + ir::EQ::Make(mod, cinn::common::make_const(a.type(), 0)), + div, + div - cinn::common::make_const(a.type(), 1)); return ir::Select::Make((a > 0 && b > 0) || (a < 0 && b < 0), div, ret); } } @@ -227,11 +227,11 @@ Expr Abs(Expr e) { Expr IsNan(Expr e) { Type type = e->type(); if (type.is_int() || type.is_uint()) { - return common::make_bool(false, type.lanes()); + return cinn::common::make_bool(false, type.lanes()); } else if (type.is_float()) { auto* node = e.As(); if (node) { - return common::make_bool(std::isnan(node->value), type.lanes()); + return cinn::common::make_bool(std::isnan(node->value), type.lanes()); } return CallExtern("isnan", {e}, {{"vectorizable", false}}); } else { @@ -258,11 +258,11 @@ Expr Infinity(const Type& type) { Expr IsInf(Expr e) { Type type = e->type(); if (type.is_int() || type.is_uint()) { - return common::make_bool(false, type.lanes()); + return cinn::common::make_bool(false, type.lanes()); } else if (type.is_float()) { auto* node = e.As(); if (node) { - return common::make_bool(std::isinf(node->value), type.lanes()); + return 
cinn::common::make_bool(std::isinf(node->value), type.lanes()); } return CallExtern("isinf", {e}, {{"vectorizable", false}}); } else { diff --git a/paddle/cinn/lang/compute.cc b/paddle/cinn/lang/compute.cc index a81ea059cc3fa7..4828eaac64e13c 100644 --- a/paddle/cinn/lang/compute.cc +++ b/paddle/cinn/lang/compute.cc @@ -128,7 +128,7 @@ ir::Tensor Compute(const std::vector<Expr> &domain, std::function<Expr(const std::vector<Expr> &)> fn, const std::string &name, const std::vector<Expr> &shape) { - auto axises = common::GenDefaultAxis(domain.size()); + auto axises = cinn::common::GenDefaultAxis(domain.size()); std::vector<Expr> _axis; for (auto &x : axises) _axis.push_back(x); Expr fn_body = fn(_axis); @@ -172,7 +172,7 @@ ir::Tensor Compute(const std::vector<Expr> &domain, // check reduce_axis not include the reserved axis name for (auto &ra : reduce_axis) { - CHECK(!common::IsAxisNameReserved(ra->name)) + CHECK(!cinn::common::IsAxisNameReserved(ra->name)) << "reduce axis [" << ra->name << "]'s name is reserved"; } diff --git a/paddle/cinn/lang/lower.cc b/paddle/cinn/lang/lower.cc index c509a1977555f2..d53a9e4d5dbe19 100644 --- a/paddle/cinn/lang/lower.cc +++ b/paddle/cinn/lang/lower.cc @@ -307,7 +307,7 @@ std::vector<ir::LoweredFunc> LowerToAstVec( target); std::vector<ir::LoweredFunc> result = lower_instance(); for (auto& res : result) { - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { res->device_api = ir::DeviceAPI::GPU; } } @@ -352,7 +352,7 @@ ir::LoweredFunc Lower(const std::string& name, break; } } - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { res->device_api = ir::DeviceAPI::GPU; } } @@ -406,7 +406,7 @@ std::vector<ir::LoweredFunc> LowerVec(const std::string& name, } } - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { res->device_api = ir::DeviceAPI::GPU; } } diff --git a/paddle/cinn/lang/lower.h b/paddle/cinn/lang/lower.h index b3f27129778b9c..d4793fb27ca977 100644 --- a/paddle/cinn/lang/lower.h +++ b/paddle/cinn/lang/lower.h @@ -50,7 +50,7 @@ ir::LoweredFunc Lower(const std::string &name, const std::vector &scalar_args = {}, const std::vector &temp_tensors = {}, ir::Module::Builder *b = nullptr, - const Target &target = common::DefaultHostTarget(), + const Target &target = cinn::common::DefaultHostTarget(), bool support_ir_schedule = false); /** @@ -71,19 +71,20 @@ std::vector<ir::LoweredFunc> LowerVec( const std::vector &scalar_args = {}, const std::vector &temp_tensors = {}, ir::Module::Builder *b = nullptr, - const Target &target = common::DefaultHostTarget(), + const Target &target = cinn::common::DefaultHostTarget(), bool support_ir_schedule = false); -ir::LoweredFunc LowerToAst(const std::string &name, - const std::vector &tensor_args, - ast_gen_ius::TensorGroup *tensor_group, - const Target &target = common::DefaultHostTarget()); +ir::LoweredFunc LowerToAst( + const std::string &name, + const std::vector &tensor_args, + ast_gen_ius::TensorGroup *tensor_group, + const Target &target = cinn::common::DefaultHostTarget()); std::vector<ir::LoweredFunc> LowerToAstVec( const std::string &name, const std::vector &tensor_args, ast_gen_ius::TensorGroup *tensor_group, - const Target &target = common::DefaultHostTarget()); + const Target &target = cinn::common::DefaultHostTarget()); std::vector GetTempBuffers( const std::vector &tensor_args, diff --git a/paddle/cinn/lang/lower_impl.cc b/paddle/cinn/lang/lower_impl.cc index 9b3b6d7ebb746b..f19ff767cece62 100644 --- a/paddle/cinn/lang/lower_impl.cc +++ b/paddle/cinn/lang/lower_impl.cc @@ -208,12 +208,12 @@
std::string CompuGraphNode::id() const { * @param t The tensor. * @param stages The stage map. */ -void CreateCompGraphWithInlineTensors(common::Graph* graph, +void CreateCompGraphWithInlineTensors(cinn::common::Graph* graph, const ir::Tensor& t, StageMap stages, std::set* visited) { if (visited->count(t)) return; - common::GraphNode* t_node = graph->RetrieveNode(t->name); + cinn::common::GraphNode* t_node = graph->RetrieveNode(t->name); if (!t_node) { t_node = graph->RegisterNode(t->name, new CompuGraphNode(t)); } @@ -239,10 +239,10 @@ void CreateCompGraphWithInlineTensors(common::Graph* graph, } } -std::unique_ptr CreateCompGraphWithInlineTensorHidden( +std::unique_ptr CreateCompGraphWithInlineTensorHidden( const std::vector& tensors, StageMap stages) { // create a graph with inline tensor first. - std::unique_ptr graph(new common::Graph); + std::unique_ptr graph(new cinn::common::Graph); std::set visited; for (auto& t : tensors) { CreateCompGraphWithInlineTensors(graph.get(), t, stages, &visited); @@ -251,9 +251,9 @@ std::unique_ptr CreateCompGraphWithInlineTensorHidden( // greedy remove the inline tensor, each time merge the inputs of an inline // tensor to its sink node. - std::set inline_nodes; + std::set inline_nodes; do { - inline_nodes = graph->CollectNodes([&](const common::GraphNode* x) { + inline_nodes = graph->CollectNodes([&](const cinn::common::GraphNode* x) { auto* comp_node = x->safe_as(); return stages[comp_node->tensor]->inlined(); }); @@ -295,7 +295,7 @@ std::unique_ptr CreateCompGraphWithInlineTensorHidden( return graph; } -void CompuGraphAddCtrlDepLinks(common::Graph* graph, StageMap stages) { +void CompuGraphAddCtrlDepLinks(cinn::common::Graph* graph, StageMap stages) { for (auto& x : graph->nodes()) { auto* node = x->safe_as(); CHECK(node); @@ -309,14 +309,14 @@ void CompuGraphAddCtrlDepLinks(common::Graph* graph, StageMap stages) { } } -std::unique_ptr CreateCompGraph( +std::unique_ptr CreateCompGraph( const std::vector& tensors, StageMap stages, bool hide_inline) { if (hide_inline) { auto graph = CreateCompGraphWithInlineTensorHidden(tensors, stages); CompuGraphAddCtrlDepLinks(graph.get(), stages); return graph; } else { - auto graph = std::make_unique(); + auto graph = std::make_unique(); std::set visited; for (auto& t : tensors) { CreateCompGraphWithInlineTensors(graph.get(), t, stages, &visited); @@ -559,7 +559,7 @@ std::vector LowerImpl::operator()() { func_iterator = ir::ScheduleBlockRealize::Make( {}, ir::ScheduleBlock::Make( - {}, {}, {}, common::UniqName("root"), func_iterator)); + {}, {}, {}, cinn::common::UniqName("root"), func_iterator)); } std::set temp_tensor_names; for (auto& t : temp_tensor_args_) temp_tensor_names.insert(t->name); @@ -609,7 +609,7 @@ std::vector LowerImpl::operator()() { std::unordered_set buffer_name_set; // TODO(Superjomn) write buffer latter. 
- if (target_ == common::DefaultNVGPUTarget()) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { for (auto& t : new_temp_tensors) { if (!tensor_map.count(t->name)) continue; auto& tt = tensor_map.at(t->name); @@ -630,7 +630,7 @@ std::vector LowerImpl::operator()() { } ir::LoweredFunc func; - if (target_ == common::DefaultNVGPUTarget()) { + if (target_ == cinn::common::DefaultNVGPUTarget()) { auto func_args2 = GenFuncArgForSplitKernel(func_iterator, new_temp_tensors); std::string new_fn_name = fn_name_; @@ -745,7 +745,7 @@ std::vector LowerImpl::GenerateFunctionBody( for (auto& var : tensor->reduce_axis) { CHECK(var->lower_bound.defined()); CHECK(var->upper_bound.defined()); - CHECK(common::is_zero(var->lower_bound)); + CHECK(cinn::common::is_zero(var->lower_bound)); CHECK(var->upper_bound.is_constant()); int_shape.push_back( static_cast(var->upper_bound.get_constant())); @@ -754,7 +754,7 @@ std::vector LowerImpl::GenerateFunctionBody( std::vector block_vars; std::vector iter_values; std::vector axis_vars = - common::GenDefaultAxis(tensor->shape.size()); + cinn::common::GenDefaultAxis(tensor->shape.size()); // bind var_values axis_vars.insert(axis_vars.end(), tensor->reduce_axis.begin(), @@ -779,7 +779,7 @@ std::vector LowerImpl::GenerateFunctionBody( ir::ScheduleBlock::Make( block_vars, {}, {}, tensor->name, store_body)); // iter_values, ir::ScheduleBlock::Make(block_vars, {}, {}, - // common::UniqName(tensor->name), store_body)); + // cinn::common::UniqName(tensor->name), store_body)); VLOG(3) << "store body\n" << store_body; } tuple_to_expr[tensor->name] = store_body; @@ -795,7 +795,7 @@ std::vector LowerImpl::GenerateFunctionBody( if (group_expr.defined()) { cuda_axis_info_.emplace_back(std::move(temp_cuda_axis_info)); - if (target_ == common::DefaultNVGPUTarget() && !all_temp_tensor) { + if (target_ == cinn::common::DefaultNVGPUTarget() && !all_temp_tensor) { exprs.push_back(group_expr); Expr body = ir::Block::Make(exprs); result.push_back(body); @@ -805,7 +805,7 @@ std::vector LowerImpl::GenerateFunctionBody( } } } - if (target_ == common::DefaultHostTarget()) { + if (target_ == cinn::common::DefaultHostTarget()) { Expr body = ir::Block::Make(exprs); result.push_back(body); exprs.clear(); diff --git a/paddle/cinn/lang/lower_impl.h b/paddle/cinn/lang/lower_impl.h index 208a57c868bd74..b5f82ba7312e67 100644 --- a/paddle/cinn/lang/lower_impl.h +++ b/paddle/cinn/lang/lower_impl.h @@ -75,7 +75,7 @@ Expr LowerGroup(const poly::ScheduleGroup& group, /** * A Computation graph node. */ -struct CompuGraphNode : public common::GraphNode { +struct CompuGraphNode : public cinn::common::GraphNode { explicit CompuGraphNode(ir::Tensor tensor) : tensor(tensor) {} ir::Tensor tensor; @@ -94,7 +94,7 @@ struct CompuGraphNode : public common::GraphNode { * @param hide_inline hide inline tensor nodes. * @return a graph. */ -std::unique_ptr CreateCompGraph( +std::unique_ptr CreateCompGraph( const std::vector& tensors, StageMap stages, bool hide_inline = false); @@ -114,7 +114,7 @@ class LowerImpl { const std::vector& tensor_args, const std::vector& scalar_args, const std::vector& temp_tensor_args = {}, - const Target& target = common::DefaultHostTarget(), + const Target& target = cinn::common::DefaultHostTarget(), bool support_ir_schedule = false); std::vector operator()(); @@ -122,7 +122,7 @@ class LowerImpl { /** * Get the computational graph. 
*/ - const common::Graph* comp_graph() const { return compu_graph_.get(); } + const cinn::common::Graph* comp_graph() const { return compu_graph_.get(); } /** * \brief generate the argument list of the final output function. @@ -193,7 +193,7 @@ class LowerImpl { StageMap stages_; //! A computation graph generated from the tensor_args and scalar_args. - std::unique_ptr compu_graph_; + std::unique_ptr compu_graph_; //! CUDA axis info for this function. std::vector cuda_axis_info_; diff --git a/paddle/cinn/lang/lower_tensor_group.cc b/paddle/cinn/lang/lower_tensor_group.cc index f59ac4ceff52fc..93453621e18393 100644 --- a/paddle/cinn/lang/lower_tensor_group.cc +++ b/paddle/cinn/lang/lower_tensor_group.cc @@ -61,7 +61,7 @@ std::vector LowerTensorGroup::operator()() { func_body = ir::ScheduleBlockRealize::Make( {}, ir::ScheduleBlock::Make( - {}, {}, {}, common::UniqName("root"), func_body)); + {}, {}, {}, cinn::common::UniqName("root"), func_body)); // 2. Assign buffer to tensors auto tensor_map = tensor_group_->AllocateBuffers(); // copy the tensor(with buffer assigned) back to func's args. @@ -217,7 +217,7 @@ std::vector LowerTensorGroup::GenerateFunctionBody( tensor->buffer.defined() && (tensor->buffer->memory_type == ir::MemoryType::GPUShared || tensor->buffer->memory_type == ir::MemoryType::GPULocal); - if (target_ == common::DefaultNVGPUTarget() && !gpu_local) { + if (target_ == cinn::common::DefaultNVGPUTarget() && !gpu_local) { result.push_back(bodies.size() == 1 ? bodies[0] : ir::Block::Make(bodies)); bodies.clear(); diff --git a/paddle/cinn/lang/lower_tensor_group.h b/paddle/cinn/lang/lower_tensor_group.h index 358e2d9ec953d5..aae18e119dd2a9 100644 --- a/paddle/cinn/lang/lower_tensor_group.h +++ b/paddle/cinn/lang/lower_tensor_group.h @@ -49,7 +49,7 @@ class LowerTensorGroup { const std::vector& scalar_args, ast_gen_ius::TensorGroup* tensor_group, const std::vector& temp_tensor_args = {}, - const Target& target = common::DefaultHostTarget()); + const Target& target = cinn::common::DefaultHostTarget()); std::vector operator()(); diff --git a/paddle/cinn/lang/lower_test.cc b/paddle/cinn/lang/lower_test.cc index 452b9e7afb7725..25b0bb20f19567 100644 --- a/paddle/cinn/lang/lower_test.cc +++ b/paddle/cinn/lang/lower_test.cc @@ -141,7 +141,7 @@ TEST(lower, temp_buffer_collects) { auto output = Compute( {M}, [&](Expr i) -> Expr { return D(i); }, "output"); - ir::Module::Builder b("somemodule", common::DefaultHostTarget()); + ir::Module::Builder b("somemodule", cinn::common::DefaultHostTarget()); auto stages = CreateStages({B, C, D, output}); diff --git a/paddle/cinn/lang/packed_func.h b/paddle/cinn/lang/packed_func.h index fa7f3e05cd34b2..94eb1e442f0dc0 100644 --- a/paddle/cinn/lang/packed_func.h +++ b/paddle/cinn/lang/packed_func.h @@ -24,7 +24,7 @@ namespace cinn { namespace lang { -using common::CINNValue; +using cinn::common::CINNValue; /** * A single argument value to Function. 
@@ -54,8 +54,8 @@ class Args { ArgValue& operator[](int i) { return values_[i]; } const ArgValue& operator[](int i) const { return values_[i]; } - common::CINNValuePack ToValuePack() const { - return common::CINNValuePack(values_); + cinn::common::CINNValuePack ToValuePack() const { + return cinn::common::CINNValuePack(values_); } private: diff --git a/paddle/cinn/lang/packed_func_test.cc b/paddle/cinn/lang/packed_func_test.cc index f803f97f58f793..47253996e2ec6c 100644 --- a/paddle/cinn/lang/packed_func_test.cc +++ b/paddle/cinn/lang/packed_func_test.cc @@ -78,12 +78,13 @@ TEST(Function, ReturnMultiValue) { int c = a + b; int d = a - b; - *ret = common::CINNValuePack{{common::CINNValue(c), common::CINNValue(d)}}; + *ret = cinn::common::CINNValuePack{ + {cinn::common::CINNValue(c), cinn::common::CINNValue(d)}}; }; PackedFunc func(body); - common::CINNValuePack ret = func(1, 2); + cinn::common::CINNValuePack ret = func(1, 2); int c = ret[0]; int d = ret[1]; diff --git a/paddle/cinn/lang/placeholder.h b/paddle/cinn/lang/placeholder.h index 3c20fa3942c909..f36d0edd2adbb8 100644 --- a/paddle/cinn/lang/placeholder.h +++ b/paddle/cinn/lang/placeholder.h @@ -125,7 +125,7 @@ void Placeholder::Init(const std::string &name, std::vector axis; for (int i = 0; i < shape.size(); i++) - axis.emplace_back(common::axis_name(i)); + axis.emplace_back(cinn::common::axis_name(i)); auto op = ir::PlaceholderOp::Make(name, shape, type_of()); @@ -145,7 +145,7 @@ void Placeholder::Init(const std::string &name, std::vector axis; for (int i = 0; i < shape.size(); i++) - axis.emplace_back(common::axis_name(i)); + axis.emplace_back(cinn::common::axis_name(i)); auto op = ir::PlaceholderOp::Make(name, shape, type_of()); diff --git a/paddle/cinn/optim/buffer_assign.cc b/paddle/cinn/optim/buffer_assign.cc index 6e5e4eb0da734b..256624617cc436 100644 --- a/paddle/cinn/optim/buffer_assign.cc +++ b/paddle/cinn/optim/buffer_assign.cc @@ -25,7 +25,7 @@ namespace optim { namespace { -struct BufferUFNode : public common::UnionFindNode { +struct BufferUFNode : public cinn::common::UnionFindNode { explicit BufferUFNode(const std::string& x) : tensor_name(x) {} const char* type_info() const override { return __type_info__; } @@ -57,7 +57,7 @@ std::map InitialAssignBuffer( Expr* expr, poly::StageMap stages, const std::map& all_tensor_map, - const common::Graph* comp_graph, + const cinn::common::Graph* comp_graph, const std::set& temp_tensor_names) { // The tensor map helps to reserve only one tensor instance for a // tensor(called the same name). @@ -69,7 +69,7 @@ std::map InitialAssignBuffer( } // union-find to cluster the tensors with the same buffer. - common::UnionFind union_find; + cinn::common::UnionFind union_find; // unify all the tensor occurance with a global one, e.g. there are multiple // tensor B exists in the expression, replace them with a shared one. @@ -107,7 +107,7 @@ std::map InitialAssignBuffer( auto _topo_order_topo_edges_ = comp_graph->topological_order(); auto& topo_order = std::get<0>(_topo_order_topo_edges_); auto& topo_edges = std::get<1>(_topo_order_topo_edges_); - for (common::GraphNode* n : topo_order) { + for (cinn::common::GraphNode* n : topo_order) { auto nn = n->safe_as(); CHECK(nn); { @@ -124,7 +124,7 @@ std::map InitialAssignBuffer( // Get a center of the cluster, it will consider the following rules // 1. Prefer a tensor arg than a temp tensor. 
auto cluster_get_center_tensor = - [&](const std::vector& cluster) { + [&](const std::vector& cluster) { ir::Tensor some_tensor; // try to find a node that is a tensor_arg, allocate buffer for it, and // make others share buffer with it. diff --git a/paddle/cinn/optim/buffer_assign.h b/paddle/cinn/optim/buffer_assign.h index e44b3a77cee2e7..03f2987bebb3da 100644 --- a/paddle/cinn/optim/buffer_assign.h +++ b/paddle/cinn/optim/buffer_assign.h @@ -33,7 +33,7 @@ std::map InitialAssignBuffer( Expr* expr, poly::StageMap stages, const std::map& all_tensor_map, - const common::Graph* comp_graph, + const cinn::common::Graph* comp_graph, const std::set& temp_tensor_names); } // namespace optim diff --git a/paddle/cinn/optim/call_arg_list_to_pod_value.cc b/paddle/cinn/optim/call_arg_list_to_pod_value.cc index 62afec620f3647..b2142b77ff52a6 100644 --- a/paddle/cinn/optim/call_arg_list_to_pod_value.cc +++ b/paddle/cinn/optim/call_arg_list_to_pod_value.cc @@ -48,7 +48,7 @@ struct CallArgListToPodValueMutator : ir::IRMutator<> { auto new_call = ir::Call::Make( Void(), op->name, - {pod_array_var, common::make_const(Int(32), args.size())}, + {pod_array_var, cinn::common::make_const(Int(32), args.size())}, {}, ir::CallType::CINN, op->func, diff --git a/paddle/cinn/optim/compute_inline_expand.cc b/paddle/cinn/optim/compute_inline_expand.cc index 20cba25ad38f17..7f42a3500ee760 100644 --- a/paddle/cinn/optim/compute_inline_expand.cc +++ b/paddle/cinn/optim/compute_inline_expand.cc @@ -115,7 +115,7 @@ struct TensorInlineExpandMutator : public ir::IRMutator<> { auto shapes = tensor->shape; CHECK_EQ(shapes.size(), node->indices.size()); for (int i = 0; i < shapes.size(); i++) { - if (common::is_zero(shapes[i] - 1)) { + if (cinn::common::is_zero(shapes[i] - 1)) { node->indices[i] = Expr(0); } } @@ -175,7 +175,7 @@ struct TensorInlineExpandMutator : public ir::IRMutator<> { } }; -struct SSANode : public common::GraphNode { +struct SSANode : public cinn::common::GraphNode { std::string id_; explicit SSANode(const std::string &id) : id_(id) {} @@ -191,7 +191,7 @@ struct SSANode : public common::GraphNode { // ir::CollectIRNodes method collects all the tensors recursively, so it can not // reserve the level information, fix it. struct SSABuilder : public ir::IRMutator<> { - common::Graph graph; + cinn::common::Graph graph; SSABuilder &operator()(Expr *expr) { ir::IRMutator<>::Visit(expr, expr); diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc index 3076b32d3ca7b7..601e869a5b91b5 100644 --- a/paddle/cinn/optim/ir_simplify.cc +++ b/paddle/cinn/optim/ir_simplify.cc @@ -34,9 +34,9 @@ namespace cinn { namespace optim { using namespace ir; // NOLINT -using common::bfloat16; -using common::ExprToGinacConverter; -using common::float16; +using cinn::common::bfloat16; +using cinn::common::ExprToGinacConverter; +using cinn::common::float16; using utils::GetStreamCnt; using utils::Replace; @@ -48,16 +48,16 @@ namespace { //! them. void PartialSimplify( Expr* expr, - const absl::flat_hash_map& var_intervals = - {}) { - *expr = common::AutoSimplify(*expr, var_intervals); + const absl::flat_hash_map& + var_intervals = {}) { + *expr = cinn::common::AutoSimplify(*expr, var_intervals); } //! Simplify the expression but Load. 
struct SimplifyNoPureMathMutator : public ir::IRMutator { - common::cas_intervals_t& var_intervals; + cinn::common::cas_intervals_t& var_intervals; explicit SimplifyNoPureMathMutator( - common::cas_intervals_t& var_intervals) // NOLINT + cinn::common::cas_intervals_t& var_intervals) // NOLINT : var_intervals(var_intervals) {} void operator()(Expr* x) { ir::IRMutator::Visit(x, x); } @@ -79,7 +79,8 @@ struct SimplifyNoPureMathMutator : public ir::IRMutator { void Visit(const PolyFor* op, Expr* expr) override { auto* node = expr->As(); - node->condition = common::SolveInequality(op->condition, op->iterator); + node->condition = + cinn::common::SolveInequality(op->condition, op->iterator); Visit(&node->body, &node->body); } @@ -93,10 +94,10 @@ struct SimplifyNoPureMathMutator : public ir::IRMutator { if (min_i && extent_i && extent_i->value > min_i->value) { var_intervals.emplace( op->loop_var->name, - common::CasInterval{min_i->value, extent_i->value - 1}); + cinn::common::CasInterval{min_i->value, extent_i->value - 1}); } else { var_intervals.emplace(op->loop_var->name, - common::CasInterval{op->min, op->extent - 1}); + cinn::common::CasInterval{op->min, op->extent - 1}); } Visit(&node->body, &node->body); @@ -123,7 +124,7 @@ struct SimplifyLoadMutator : public ir::IRMutator { void Visit(const Load* expr, Expr* op) override { auto* node = op->As(); for (auto& idx : node->indices) { - if (common::IsPureMath(idx)) { + if (cinn::common::IsPureMath(idx)) { PartialSimplify(&idx, var_intervals_); } else { SimplifyNoPureMathMutator mutator(var_intervals_); @@ -138,7 +139,7 @@ struct SimplifyLoadMutator : public ir::IRMutator { if (min_i && extent_i && extent_i->value > min_i->value) { var_intervals_.emplace( op->loop_var->name, - common::CasInterval{min_i->value, extent_i->value - 1}); + cinn::common::CasInterval{min_i->value, extent_i->value - 1}); } auto* node = expr->As(); @@ -151,7 +152,7 @@ struct SimplifyLoadMutator : public ir::IRMutator { } } - common::cas_intervals_t var_intervals_; + cinn::common::cas_intervals_t var_intervals_; }; struct SimplifyStoreMutator : public ir::IRMutator { @@ -161,7 +162,7 @@ struct SimplifyStoreMutator : public ir::IRMutator { auto* node = op->As(); for (auto& idx : node->indices) { - if (common::IsPureMath(idx)) { + if (cinn::common::IsPureMath(idx)) { PartialSimplify(&idx, var_intervals_); } else { SimplifyNoPureMathMutator mutator(var_intervals_); @@ -176,7 +177,7 @@ struct SimplifyStoreMutator : public ir::IRMutator { if (min_i && extent_i) { var_intervals_.emplace( op->loop_var->name, - common::CasInterval{min_i->value, extent_i->value - 1}); + cinn::common::CasInterval{min_i->value, extent_i->value - 1}); } auto* node = expr->As(); @@ -189,7 +190,7 @@ struct SimplifyStoreMutator : public ir::IRMutator { } } - common::cas_intervals_t var_intervals_; + cinn::common::cas_intervals_t var_intervals_; }; struct SimplifyRampMutator : public ir::IRMutator { @@ -198,9 +199,9 @@ struct SimplifyRampMutator : public ir::IRMutator { void Visit(const Ramp* op, Expr* expr) override { auto* node = expr->As(); - CHECK(common::IsPureMath(node->base)) + CHECK(cinn::common::IsPureMath(node->base)) << node->base << "is not a pure math!"; - CHECK(common::IsPureMath(node->stride)) + CHECK(cinn::common::IsPureMath(node->stride)) << node->stride << "is not a pure math!"; PartialSimplify(&node->base); @@ -215,8 +216,9 @@ struct SimplifyRampMutator : public ir::IRMutator { auto b_ramp = b.As(); if (a_ramp && b_ramp && a_ramp->lanes == b_ramp->lanes) { - Expr base_add = 
common::AutoSimplify(a_ramp->base + b_ramp->base); - Expr stride_add = common::AutoSimplify(a_ramp->stride + b_ramp->stride); + Expr base_add = cinn::common::AutoSimplify(a_ramp->base + b_ramp->base); + Expr stride_add = + cinn::common::AutoSimplify(a_ramp->stride + b_ramp->stride); *expr = ir::Ramp::Make(base_add, stride_add, a_ramp->lanes); } } @@ -229,7 +231,7 @@ struct SimplifyIfThenElseMutator : public ir::IRMutator<> { void Visit(const IfThenElse* op, Expr* expr) override { auto* node = expr->As(); - node->condition = common::AutoSimplify(node->condition); + node->condition = cinn::common::AutoSimplify(node->condition); auto* condition_int = node->condition.As(); auto* condition_uint = node->condition.As(); @@ -335,7 +337,7 @@ struct SimplifyBlocksMutator : public ir::IRMutator<> { }; struct SimplifyForLoopsMutator : public ir::IRMutator<> { - absl::flat_hash_map var_intervals; + absl::flat_hash_map var_intervals; SimplifyForLoopsMutator() {} void operator()(Expr* x) { ir::IRMutator::Visit(x, x); } @@ -353,7 +355,8 @@ struct SimplifyForLoopsMutator : public ir::IRMutator<> { VLOG(6) << "Simplify current For Loop"; std::string var_name = node->loop_var->name; var_intervals.emplace( - var_name, common::CasInterval{min_i->value, extent_i->value - 1}); + var_name, + cinn::common::CasInterval{min_i->value, extent_i->value - 1}); *expr = node->body; @@ -468,7 +471,7 @@ void Simplify(Expr* expr) { SimplifyStoreMutator()(expr); SimplifyIfThenElseMutator()(expr); - common::cas_intervals_t var_intervals; + cinn::common::cas_intervals_t var_intervals; SimplifyNoPureMathMutator mutator(var_intervals); mutator(expr); diff --git a/paddle/cinn/optim/map_extern_call.cc b/paddle/cinn/optim/map_extern_call.cc index 3a9531391ca9dc..91122c0b5b60a2 100644 --- a/paddle/cinn/optim/map_extern_call.cc +++ b/paddle/cinn/optim/map_extern_call.cc @@ -91,8 +91,8 @@ void MapExternCall(Expr *e, Target target) { return; } - std::string extern_func = - hlir::GetExternFuncName(common::DefaultNVGPUTarget(), dtype, name); + std::string extern_func = hlir::GetExternFuncName( + cinn::common::DefaultNVGPUTarget(), dtype, name); *expr = lang::CallExtern(extern_func, node->read_args, node->attrs); } diff --git a/paddle/cinn/optim/remove_schedule_block_test.cc b/paddle/cinn/optim/remove_schedule_block_test.cc index 643412b2f261d2..401225fee2f6f0 100644 --- a/paddle/cinn/optim/remove_schedule_block_test.cc +++ b/paddle/cinn/optim/remove_schedule_block_test.cc @@ -33,7 +33,7 @@ TEST(RemovescheduleBlock, basic) { Context::Global().ResetNameId(); Placeholder A("A", {Expr(100), Expr(20)}); Placeholder B("B", {Expr(20), Expr(50)}); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); Module::Builder builder("matmul", target); // C = A * B Var k(20, "k0"); diff --git a/paddle/cinn/optim/replace_cross_thread_reduction_test.cc b/paddle/cinn/optim/replace_cross_thread_reduction_test.cc index fb8c0d185ed119..d7bd9f6defc49d 100644 --- a/paddle/cinn/optim/replace_cross_thread_reduction_test.cc +++ b/paddle/cinn/optim/replace_cross_thread_reduction_test.cc @@ -33,7 +33,7 @@ TEST(CrossThreadReductionReplacer, basic) { #ifdef CINN_WITH_CUDA Context::Global().ResetNameId(); Placeholder A("A", {Expr(64), Expr(128)}); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); Module::Builder builder("reduce_sum", target); Var reduce_j(128, "reduce_j"); ir::Tensor B = Compute( diff --git a/paddle/cinn/optim/replace_var_with_expr.cc 
b/paddle/cinn/optim/replace_var_with_expr.cc index aae5f9a4f89450..25633dfd768fc4 100644 --- a/paddle/cinn/optim/replace_var_with_expr.cc +++ b/paddle/cinn/optim/replace_var_with_expr.cc @@ -158,7 +158,7 @@ std::vector> CollectTensorIndex( std::vector> result = mutator(source); for (auto& i : result) { for (auto& j : i) { - j = common::AutoSimplify(j); + j = cinn::common::AutoSimplify(j); } } return result; diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index f9c74702733263..7d80539541b22e 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -104,7 +104,7 @@ void RemoveGpuForloopsAxis(Expr *expr) { if (for_n) { // for(i, 2, 100); // ^ - if (for_n->min != common::make_const(0)) { + if (for_n->min != cinn::common::make_const(0)) { condition_append(ir::GE::Make(for_n->loop_var, for_n->min)); } @@ -112,7 +112,7 @@ void RemoveGpuForloopsAxis(Expr *expr) { // ^ condition_append(ir::LT::Make(for_n->loop_var, for_n->extent)); } else { - if (poly_for_n->init != common::make_const(0)) { + if (poly_for_n->init != cinn::common::make_const(0)) { condition_append( ir::GE::Make(poly_for_n->iterator, poly_for_n->init)); } @@ -162,7 +162,7 @@ void CudaSyncThreadsDropIfThenElse(Expr *expr) { if (!blocked_statement_stack.empty()) { auto *last_for = blocked_statement_stack.back()->As(); if (auto *eq_n = last_for->condition.As()) { - if (eq_n->b() == common::make_const(0)) { + if (eq_n->b() == cinn::common::make_const(0)) { *blocked_statement_stack.back() = *expr; } } @@ -376,7 +376,7 @@ void UpdateBufferAxisPass(ir::Expr *expr) { auto &indices = load ? load->indices : store->indices; for (auto &indice : indices) { optim::ReplaceVarWithExpr(&indice, loop_var, ir::Expr(0)); - indice = common::AutoSimplify(indice); + indice = cinn::common::AutoSimplify(indice); } } } @@ -436,7 +436,7 @@ class SharedAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = common::AutoSimplify(indice); + indice = cinn::common::AutoSimplify(indice); } } ir::IRMutator<>::Visit(op, expr); @@ -457,7 +457,7 @@ class SharedAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = common::AutoSimplify(indice); + indice = cinn::common::AutoSimplify(indice); } } ir::IRMutator<>::Visit(op, expr); @@ -484,7 +484,7 @@ class LocalAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = common::AutoSimplify(indice); + indice = cinn::common::AutoSimplify(indice); } } ir::IRMutator<>::Visit(op, expr); @@ -505,7 +505,7 @@ class LocalAxisVisitor : public ir::IRMutator<> { for (auto axis : gpu_axis) { optim::ReplaceVarWithExpr(&indice, ir::Var(axis), ir::Expr(0)); } - indice = common::AutoSimplify(indice); + indice = cinn::common::AutoSimplify(indice); } } ir::IRMutator<>::Visit(op, expr); @@ -602,8 +602,8 @@ class ResizeBufferSizeVisitor : public ir::IRMutator<> { ReplaceVarWithExpr(&tmp, var, Expr(idx)); if (deep == vars.size() - 1) { - auto simplify = common::AutoSimplify(tmp); - auto range = common::AutoSimplify(simplify); + auto simplify = cinn::common::AutoSimplify(tmp); + auto range = cinn::common::AutoSimplify(simplify); CHECK(range.is_constant()); max_range = std::max(max_range, range.as_int32() + 1); } else { @@ -635,7 +635,7 @@ class ReplaceVarToZero : public 
ir::IRMutator<> { for (auto var_ : loop_var_) { optim::ReplaceVarWithExpr(&indice, ir::Var(var_), ir::Expr(0)); } - indice = common::AutoSimplify(indice); + indice = cinn::common::AutoSimplify(indice); } ir::IRMutator<>::Visit(op, expr); } @@ -651,7 +651,7 @@ class ReplaceVarToZero : public ir::IRMutator<> { for (auto var_ : loop_var_) { optim::ReplaceVarWithExpr(&indice, ir::Var(var_), ir::Expr(0)); } - indice = common::AutoSimplify(indice); + indice = cinn::common::AutoSimplify(indice); } ir::IRMutator<>::Visit(op, expr); diff --git a/paddle/cinn/optim/transform_polyfor_to_for.cc b/paddle/cinn/optim/transform_polyfor_to_for.cc index 9649364ea13821..8a7392ed5d54ba 100644 --- a/paddle/cinn/optim/transform_polyfor_to_for.cc +++ b/paddle/cinn/optim/transform_polyfor_to_for.cc @@ -74,12 +74,12 @@ struct PolyForWithSimpleConditionToForMutator : public ir::IRMutator { auto* le_n = node->condition.As(); if (lt_n) { - if (lt_n->b() != common::make_const(0)) { + if (lt_n->b() != cinn::common::make_const(0)) { node->condition = lt_n->a() - lt_n->b() < 0; } } if (le_n) { - if (le_n->b() != common::make_const(0)) { + if (le_n->b() != cinn::common::make_const(0)) { node->condition = le_n->a() - le_n->b() <= 0; } } @@ -119,7 +119,7 @@ struct PolyForWithSimpleConditionToForMutator : public ir::IRMutator { Expr lhs = lt_n ? lt_n->a() : le_n->a(); Expr rhs = lt_n ? lt_n->b() : PlusOneWithMinMax(le_n->b()); - rhs = common::AutoSimplify(rhs); + rhs = cinn::common::AutoSimplify(rhs); if (op->is_vectorized()) CHECK(op->vectorize_info().valid()); diff --git a/paddle/cinn/optim/unroll_loops_test.cc b/paddle/cinn/optim/unroll_loops_test.cc index 5ce412a245e3eb..63936d931f34f7 100644 --- a/paddle/cinn/optim/unroll_loops_test.cc +++ b/paddle/cinn/optim/unroll_loops_test.cc @@ -39,7 +39,7 @@ TEST(UnrollLoops, unrolled_tag) { auto stages = CreateStages({C}); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); auto func = cinn::lang::LowerVec( "test_unrolled_tag", stages, {A, B, C}, {}, {}, nullptr, target, true); auto ast_expr = func[0]->body; @@ -80,7 +80,7 @@ TEST(UnrollLoops, auto_unroll) { "B"); auto stages = CreateStages({B}); - Target target = common::DefaultHostTarget(); + Target target = cinn::common::DefaultHostTarget(); auto func = cinn::lang::LowerVec( "test_auto_unroll", stages, {A, B}, {}, {}, nullptr, target, true); auto ast_expr = func[0]->body; diff --git a/paddle/cinn/optim/var_mod_simplify.cc b/paddle/cinn/optim/var_mod_simplify.cc index dcd6de24fef2e7..811208c49de256 100644 --- a/paddle/cinn/optim/var_mod_simplify.cc +++ b/paddle/cinn/optim/var_mod_simplify.cc @@ -80,11 +80,11 @@ struct ReplaceVarWithDivMutator : public ir::IRMutator<> { } // namespace void VarModSimplify(Expr* e) { - *e = common::AutoSimplify(*e); + *e = cinn::common::AutoSimplify(*e); ReplaceModWithDivMutator()(e); ReplaceDivWithVarMutator mutator; mutator(e); - *e = common::AutoSimplify(*e); + *e = cinn::common::AutoSimplify(*e); auto div_var_map = mutator.div_var_map_; ReplaceVarWithDivMutator()(e, mutator.div_var_map_); } diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc index 30701216ade95c..0495c1ef0ffe72 100644 --- a/paddle/cinn/optim/vectorize_loops.cc +++ b/paddle/cinn/optim/vectorize_loops.cc @@ -37,9 +37,9 @@ namespace cinn { namespace optim { using namespace ir; // NOLINT -using common::make_const; -using common::make_one; -using common::make_zero; +using cinn::common::make_const; +using cinn::common::make_one; +using 
cinn::common::make_zero; //! Widen an expression to the given number of lanes. Expr Widen(Expr e, int lanes) { @@ -62,7 +62,7 @@ class TensorVectorizeTeller : public ir::IRMutator { TensorVectorizeTeller( const Var &iter_var, const int factor, - const absl::flat_hash_map + const absl::flat_hash_map *var_intervals) : iter_var_(iter_var), factor_(factor), var_intervals_(var_intervals) {} @@ -78,7 +78,8 @@ class TensorVectorizeTeller : public ir::IRMutator { const Var iter_var_; // loop var of new for-loop split from the vectorized loop const int factor_; - const absl::flat_hash_map *var_intervals_; + const absl::flat_hash_map + *var_intervals_; // save (tensor name) -> (bool flag) to indentify whether tensors can be // vectorized or not std::unordered_map tensor2flag_; @@ -154,7 +155,7 @@ class TensorVectorizeTeller : public ir::IRMutator { for (int i = 1; i < interval.r; ++i) { Expr next_idx = ir::ir_utils::IRCopy(indices.back()); cinn::ir::ir_utils::IrReplace(&next_idx, Expr(iter_var_), Expr(i)); - auto gap = common::AutoSimplify(Expr(next_idx - first_idx)); + auto gap = cinn::common::AutoSimplify(Expr(next_idx - first_idx)); if (!gap.As() || gap.as_int32() != i) { VLOG(5) << "Tensor:" << tensor->name << " is not accessed sequentially, next:" << next_idx @@ -195,10 +196,11 @@ class CudaVectorizer : public IRMutator { public: static constexpr int CudaVectorTypeMaxLanes = 8; - CudaVectorizer(const Var &iter_var, - const int factor, - const absl::flat_hash_map - *var_intervals) + CudaVectorizer( + const Var &iter_var, + const int factor, + const absl::flat_hash_map + *var_intervals) : iter_var_(iter_var), factor_(factor), vectorized_teller_(iter_var, factor, var_intervals) { @@ -268,7 +270,8 @@ class CudaVectorizer : public IRMutator { } std::string GetVectorTypeName(Type type) { - std::string name_prefix = common::customized_type::kcuda_builtin_vector_t; + std::string name_prefix = + cinn::common::customized_type::kcuda_builtin_vector_t; #define GET_CUDA_VECTOR_TYPE_NAME(pred_expr, scalar_name) \ if (pred_expr) { \ return name_prefix + scalar_name + std::to_string(factor_); \ @@ -359,7 +362,7 @@ class Vectorizer : public IRMutator { Expr ramp_; - absl::flat_hash_map var_intervals_; + absl::flat_hash_map var_intervals_; //! A suffix to attach to widened variables. std::string widen_suffix; @@ -367,7 +370,7 @@ class Vectorizer : public IRMutator { public: Vectorizer(const Var &var, int lanes, - const absl::flat_hash_map + const absl::flat_hash_map &var_intervals = {}) : var(var), lanes_(lanes), var_intervals_(var_intervals) { // the identity ramp. 
@@ -575,10 +578,10 @@ class Vectorizer : public IRMutator { std::map var_map; var_map[var.As()] = idx; - common::Substitute(expr, var_map); + cinn::common::Substitute(expr, var_map); *expr = ir::For::Make(idx, - common::make_const(0), - common::make_const(lanes_), + cinn::common::make_const(0), + cinn::common::make_const(lanes_), ForType::Serial, DeviceAPI::Host, *expr); @@ -666,7 +669,7 @@ class Vectorizer : public IRMutator { struct VectorizeLoops_ : public IRMutator { const Target ⌖ - absl::flat_hash_map var_intervals; + absl::flat_hash_map var_intervals; bool vectorizable_ = true; explicit VectorizeLoops_(const Target &t) : target(t) {} @@ -680,7 +683,8 @@ struct VectorizeLoops_ : public IRMutator { bool is_changed = false; // simplify the complicated index from poly in the format of div/mod for (int i = 0; i < indices.size(); i++) { - node->indices[i] = common::AutoSimplify(node->indices[i], var_intervals); + node->indices[i] = + cinn::common::AutoSimplify(node->indices[i], var_intervals); Simplify(&node->indices[i]); if (!node->indices[i].same_as(indices[i])) { is_changed = true; @@ -700,7 +704,8 @@ struct VectorizeLoops_ : public IRMutator { bool is_changed = false; // simplify the complicated index from poly in the format of div/mod for (int i = 0; i < indices.size(); i++) { - node->indices[i] = common::AutoSimplify(node->indices[i], var_intervals); + node->indices[i] = + cinn::common::AutoSimplify(node->indices[i], var_intervals); Simplify(&node->indices[i]); if (!node->indices[i].same_as(indices[i])) { is_changed = true; @@ -723,10 +728,12 @@ struct VectorizeLoops_ : public IRMutator { auto loopvar_name = forloop->loop_var->name; if (forloop->extent.As()) { var_intervals.emplace( - loopvar_name, common::CasInterval{0, forloop->extent.as_int32() - 1}); + loopvar_name, + cinn::common::CasInterval{0, forloop->extent.as_int32() - 1}); } else { - var_intervals.emplace(loopvar_name, - common::CasInterval{Expr(0), forloop->extent - 1}); + var_intervals.emplace( + loopvar_name, + cinn::common::CasInterval{Expr(0), forloop->extent - 1}); } // the extent the forloops marked as Vectorized should be int constant if (forloop->is_vectorized()) { @@ -735,7 +742,7 @@ struct VectorizeLoops_ : public IRMutator { CHECK_GT(forloop->vectorize_info().factor, 0); CHECK(is_zero(forloop->min)); - Expr for_extent = common::AutoSimplify(forloop->extent); + Expr for_extent = cinn::common::AutoSimplify(forloop->extent); Simplify(&for_extent); node->extent = for_extent; auto *extent_min = for_extent.As(); @@ -744,7 +751,7 @@ struct VectorizeLoops_ : public IRMutator { vectorizable_ = true; IRMutator<>::Visit(&node->body, &node->body); - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { if (!forloop->extent.As() || forloop->extent.as_int32() % forloop->vectorize_info().factor != 0) { @@ -794,7 +801,7 @@ struct VectorizeLoops_ : public IRMutator { << extent; VLOG(2) << "before vectorize body:\n" << node->body; - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { CudaVectorizer cuda_vectorizer( new_forloop->loop_var, factor, &var_intervals); cuda_vectorizer.Visit(&new_forloop->body); @@ -841,7 +848,7 @@ struct VectorizeLoops_ : public IRMutator { bool UnrollCmpFor(For *outer_for, For *inner_for, Expr *expr) { CHECK(outer_for); CHECK(inner_for); - Expr inner_for_extent = common::AutoSimplify(inner_for->extent); + Expr inner_for_extent = cinn::common::AutoSimplify(inner_for->extent); Simplify(&inner_for_extent); auto 
*extent_min = inner_for_extent.As(); if (extent_min) { @@ -855,7 +862,7 @@ struct VectorizeLoops_ : public IRMutator { auto b_int = a.As(); if (a_int || b_int) { condition = - common::SolveInequality(LE::Make(a, b), outer_for->loop_var); + cinn::common::SolveInequality(LE::Make(a, b), outer_for->loop_var); Simplify(&condition); } if (condition.defined()) { @@ -870,7 +877,7 @@ struct VectorizeLoops_ : public IRMutator { DeviceAPI::UNK, inner_for->body, inner_for->vectorize_info())}); - Expr new_extent_a = common::AutoSimplify(le_n->b() + 1); + Expr new_extent_a = cinn::common::AutoSimplify(le_n->b() + 1); Expr out_for_a = For::Make(outer_for->loop_var, outer_for->min, new_extent_a, @@ -879,9 +886,9 @@ struct VectorizeLoops_ : public IRMutator { inner_for_a, outer_for->vectorize_info()); Var new_iterator_inner( - common::UniqName(inner_for->loop_var->name + "_s")); + cinn::common::UniqName(inner_for->loop_var->name + "_s")); Var new_iterator_outer( - common::UniqName(outer_for->loop_var->name + "_s")); + cinn::common::UniqName(outer_for->loop_var->name + "_s")); Expr inner_for_b = Block::Make({For::Make(new_iterator_inner, @@ -928,10 +935,10 @@ struct VectorizeLoops_ : public IRMutator { int extent_trunc = extent_int / factor; int extent_times = extent_int % factor == 0 ? extent_trunc : extent_trunc + 1; - times = common::make_const(forloop->extent->type(), extent_times); + times = cinn::common::make_const(forloop->extent->type(), extent_times); } else { - times = - common::AutoSimplify(Div::Make(forloop->extent, make_const(factor))); + times = cinn::common::AutoSimplify( + Div::Make(forloop->extent, make_const(factor))); Simplify(×); } @@ -943,21 +950,22 @@ struct VectorizeLoops_ : public IRMutator { if (times_int && forloop->extent.as_int32() >= 1) { var_intervals.emplace( forloop->loop_var->name, - common::CasInterval{0, forloop->extent.as_int32() - 1}); + cinn::common::CasInterval{0, forloop->extent.as_int32() - 1}); } else { var_intervals.erase(forloop->loop_var->name); - var_intervals.emplace(forloop->loop_var->name, - common::CasInterval{Expr(0), forloop->extent - 1}); + var_intervals.emplace( + forloop->loop_var->name, + cinn::common::CasInterval{Expr(0), forloop->extent - 1}); } // create the new forloop { Var new_iterator(Context::Global().NewName("vi")); var_intervals.emplace(new_iterator->name, - common::CasInterval{0, factor - 1}); + cinn::common::CasInterval{0, factor - 1}); // eliminate for 1 Expr new_index; - if (common::is_zero(times - 1)) { + if (cinn::common::is_zero(times - 1)) { new_index = Expr(new_iterator); } else { new_index = Expr(forloop->loop_var) * factor + Expr(new_iterator); diff --git a/paddle/cinn/optim/vectorize_loops_test.cc b/paddle/cinn/optim/vectorize_loops_test.cc index f3ad6ee07f3c44..270e37f1dc46ae 100644 --- a/paddle/cinn/optim/vectorize_loops_test.cc +++ b/paddle/cinn/optim/vectorize_loops_test.cc @@ -52,7 +52,7 @@ TEST(Vectorize, replace_var) { auto funcs = Lower("matmul", stages, {A, B, C}); - Expr func = optim::Optimize(funcs, common::DefaultHostTarget()); + Expr func = optim::Optimize(funcs, cinn::common::DefaultHostTarget()); Target target; target.arch = Target::Arch ::X86; @@ -221,14 +221,14 @@ TEST(Vectorize, single_for) { VectorizeInfo vectorize_info(0, 16); auto forloop = ir::For::Make(loop_var, - common::make_const(0), - common::make_const(16), + cinn::common::make_const(0), + cinn::common::make_const(16), ir::ForType::Vectorized, ir::DeviceAPI::UNK, body, vectorize_info); - forloop = optim::Optimize(forloop, common::DefaultHostTarget()); 
+ forloop = optim::Optimize(forloop, cinn::common::DefaultHostTarget()); LOG(INFO) << "Forloop\n" << forloop; } @@ -244,7 +244,7 @@ TEST(Vectorize, cuda_vectorize) { auto stages = CreateStages({C}); stages[C]->Vectorize(1, 4); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); auto func = Lower("matmul", stages, {A, B, C}, {}, {}, nullptr, target); auto target_expr = R"ROC( @@ -281,7 +281,7 @@ TEST(Vectorize, cuda_vectorize_with_constant) { auto stages = CreateStages({C}); stages[C]->Vectorize(1, 4); - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); auto func = Lower("mul_const", stages, {A, C}, {}, {}, nullptr, target); } diff --git a/paddle/cinn/poly/domain.cc b/paddle/cinn/poly/domain.cc index dc6a27d82bfd87..c6f4479bf8bba9 100644 --- a/paddle/cinn/poly/domain.cc +++ b/paddle/cinn/poly/domain.cc @@ -62,7 +62,7 @@ std::string Domain::__str__() const { isl::set Domain::to_isl() const { VLOG(3) << "isl::set " << __str__(); - isl::set x(common::Context::isl_ctx(), __str__()); + isl::set x(cinn::common::Context::isl_ctx(), __str__()); return x; } diff --git a/paddle/cinn/poly/graph.cc b/paddle/cinn/poly/graph.cc index c647cf49565dc3..ef5aa875d5b4cf 100755 --- a/paddle/cinn/poly/graph.cc +++ b/paddle/cinn/poly/graph.cc @@ -85,12 +85,12 @@ std::string DataFlowGraphNode::id() const { bool DataFlowGraphNode::IsLinkedTo(const DataFlowGraphNode* node) const { bool found = std::find_if(inlinks_.begin(), inlinks_.end(), - [=](const Shared<common::GraphEdge>& x) { + [=](const Shared<cinn::common::GraphEdge>& x) { return x->source() == node; }) != std::end(inlinks_); return found || std::find_if(outlinks_.begin(), outlinks_.end(), - [=](const Shared<common::GraphEdge>& x) { + [=](const Shared<cinn::common::GraphEdge>& x) { return x->sink() == node; }) != std::end(outlinks_); } diff --git a/paddle/cinn/poly/graph.h b/paddle/cinn/poly/graph.h index e0c15f7be793fb..14b2a61f41e499 100644 --- a/paddle/cinn/poly/graph.h +++ b/paddle/cinn/poly/graph.h @@ -27,7 +27,7 @@ namespace cinn { namespace poly { -struct DataFlowGraphNode : public common::GraphNode { +struct DataFlowGraphNode : public cinn::common::GraphNode { //! Used for union find to gather groups. DataFlowGraphNode* group_parent{}; //! Each stage belongs to a node. @@ -58,12 +58,12 @@ struct DataFlowGraphNode : public common::GraphNode { const DataFlowGraphNode* b); }; -struct DataFlowGraphEdge : public common::GraphEdge {}; +struct DataFlowGraphEdge : public cinn::common::GraphEdge {}; /** * DataFlowGraph help to record the data dependencies between the Stages. */ -struct DataFlowGraph : public common::Graph {}; +struct DataFlowGraph : public cinn::common::Graph {}; /** * Create a dependency graph given some stages. @@ -93,7 +93,7 @@ struct Group { * Nodes has the stages has dependency relation and has the same iteration * domain, then they will be put in the same sub-graph. */ -std::vector<Group> PartitionGraphByIterationDomain(common::Graph* graph); +std::vector<Group> PartitionGraphByIterationDomain(cinn::common::Graph* graph); } // namespace detail diff --git a/paddle/cinn/poly/poly_scheduler.cc b/paddle/cinn/poly/poly_scheduler.cc index b916e5952ffe4b..0e3b84a70e8e23 100644 --- a/paddle/cinn/poly/poly_scheduler.cc +++ b/paddle/cinn/poly/poly_scheduler.cc @@ -33,11 +33,11 @@ namespace detail { //! Visit the nodes in topological order, if one node is valid to visit, visit //! it and check whether its out link children are ready to visit, merge them to //! the same group. NOTE this is discarded.
-std::vector PartitionGraphByIterationDomain(common::Graph* graph) { +std::vector PartitionGraphByIterationDomain(cinn::common::Graph* graph) { VLOG(3) << "graph:\n" << graph->Visualize(); // collect indegrees for naive topological traversal. std::map indegree; - for (common::GraphNode* n : graph->nodes()) { + for (cinn::common::GraphNode* n : graph->nodes()) { auto* node = n->safe_as(); indegree[node] = node->inlinks().size(); } @@ -145,10 +145,11 @@ bool CheckGroupValid(const std::vector& groups) { } //! Tell if \param a links to \param b. -bool IsLinkTo(const common::GraphNode* a, const common::GraphNode* b) { +bool IsLinkTo(const cinn::common::GraphNode* a, + const cinn::common::GraphNode* b) { // dfs - std::stack stack({a}); - std::unordered_set visited; + std::stack stack({a}); + std::unordered_set visited; while (!stack.empty()) { auto* top = stack.top(); stack.pop(); @@ -169,9 +170,9 @@ bool IsLinkTo(const common::GraphNode* a, const common::GraphNode* b) { return false; } -bool IsBetween(const common::GraphNode* x, - const common::GraphNode* a, - const common::GraphNode* b) { +bool IsBetween(const cinn::common::GraphNode* x, + const cinn::common::GraphNode* a, + const cinn::common::GraphNode* b) { if (IsLinkTo(a, x) && IsLinkTo(x, b)) return true; if (IsLinkTo(x, a) && IsLinkTo(b, x)) return true; return false; @@ -191,8 +192,8 @@ std::vector TopoSortGroups(std::vector& groups) { // NOLINT node2group[node->id()] = group; in_degree += node->inlinks().size(); for (auto& node2 : group->nodes) { - if (node2->as()->IsLinkedTo( - node->as())) { + if (node2->as()->IsLinkedTo( + node->as())) { in_degree--; } } @@ -240,7 +241,7 @@ std::vector TopoSortGroups(std::vector& groups) { // NOLINT * 2. If ComputeAt is set between two stages and their iteration domain matches, * the stages will be put in a group with relative order. */ -std::vector NaivePartitionGraph(common::Graph* graph) { +std::vector NaivePartitionGraph(cinn::common::Graph* graph) { std::map> node_groups; auto topo_order = graph->topological_order(); auto& nodes_in_order = std::get<0>(topo_order); @@ -252,7 +253,7 @@ std::vector NaivePartitionGraph(common::Graph* graph) { } // process compute_at - absl::flat_hash_map + absl::flat_hash_map node2score; // record each node's score for sorting. int score = 0; for (auto* n : nodes_in_order) { diff --git a/paddle/cinn/poly/schedule.cc b/paddle/cinn/poly/schedule.cc index 43357dbdfb1044..1904b76cb777d9 100644 --- a/paddle/cinn/poly/schedule.cc +++ b/paddle/cinn/poly/schedule.cc @@ -184,7 +184,7 @@ void SchedulerBase::AddStage(const Stage &x) { std::string id = isl_map_get_tuple_name(x.transform().get(), isl_dim_in); schedule_graph_.RegisterNode( x.id(), - common::make_shared( + cinn::common::make_shared( id, isl_get_dim_names(x.transform(), isl_dim_out), &x)); // record the longest dimensions. diff --git a/paddle/cinn/poly/schedule.h b/paddle/cinn/poly/schedule.h index 1c28c5961e4fd4..77059ab863f232 100755 --- a/paddle/cinn/poly/schedule.h +++ b/paddle/cinn/poly/schedule.h @@ -51,7 +51,7 @@ struct TimeDim { }; class ScheduleGraphNode; -struct ScheduleGraph : public common::Graph {}; +struct ScheduleGraph : public cinn::common::Graph {}; /** * ISL schedule map with time space, used to generate the final schedule. 
@@ -205,9 +205,9 @@ std::unique_ptr CreateSchedule( // std::vector GatherStagesInTensors(const std::vector &xs, // bool with_placeholder = false); -struct ScheduleGraphEdge : public common::GraphEdge { - ScheduleGraphEdge(common::GraphNode *a, common::GraphNode *b) - : common::GraphEdge(a, b) {} +struct ScheduleGraphEdge : public cinn::common::GraphEdge { + ScheduleGraphEdge(cinn::common::GraphNode *a, cinn::common::GraphNode *b) + : cinn::common::GraphEdge(a, b) {} //! Dependency level. int level{-1}; @@ -216,7 +216,7 @@ struct ScheduleGraphEdge : public common::GraphEdge { /** * Node in the schedule graph. */ -struct ScheduleGraphNode : public common::GraphNode { +struct ScheduleGraphNode : public cinn::common::GraphNode { TimeSchedule time_schedule; Stage *stage{}; diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc index 53dc1ab6aa6cfb..e04c178805ae47 100644 --- a/paddle/cinn/poly/stage.cc +++ b/paddle/cinn/poly/stage.cc @@ -227,8 +227,8 @@ std::tuple // Stage::Tile(int level0, int level1, int factor0, int factor1) { AssertAxisIsNotLocked(level0); AssertAxisIsNotLocked(level1); - Iterator i0(common::axis_name(level0)); - Iterator i1(common::axis_name(level1)); + Iterator i0(cinn::common::axis_name(level0)); + Iterator i1(cinn::common::axis_name(level1)); return Tile(i0, i1, factor0, factor1); } @@ -291,7 +291,7 @@ void Stage::ChangeIndex(Stage *other) { } this->tensor()->new_indices = indices[0]; - std::vector axis_var = common::GenDefaultAxis(indices[0].size()); + std::vector axis_var = cinn::common::GenDefaultAxis(indices[0].size()); for (int i = 0; i < axis_var.size(); i++) { optim::ReplaceVarWithExpr(&(this->expr_), axis_var[i], indices[0][i]); } @@ -325,7 +325,7 @@ void Stage::AddForLoopInTransform(std::vector> &indices) { int int_range = GetRange(indices, i); if (int_range == 0) continue; - std::string dim_name = common::axis_name(i) + "_at"; + std::string dim_name = cinn::common::axis_name(i) + "_at"; Var dim_var(dim_name); indices[0][i] = ir::Add::Make(indices[0][i], Expr(dim_var)); std::string this_domain = isl_set_to_str(domain_.get()); diff --git a/paddle/cinn/poly/stage.h b/paddle/cinn/poly/stage.h index cf0586710bc9c7..ac36e5fd98e092 100644 --- a/paddle/cinn/poly/stage.h +++ b/paddle/cinn/poly/stage.h @@ -529,7 +529,7 @@ inline std::string OuterName(const std::string& name); inline std::string OuterName(const Iterator& iterator); inline Iterator DefaultIterator(int i) { - return Iterator(common::axis_name(i)); + return Iterator(cinn::common::axis_name(i)); } /** diff --git a/paddle/cinn/poly/stage_test.cc b/paddle/cinn/poly/stage_test.cc index a9c00a82f2487c..e8cbf9dd8ff878 100644 --- a/paddle/cinn/poly/stage_test.cc +++ b/paddle/cinn/poly/stage_test.cc @@ -215,10 +215,10 @@ function fn (_A, _A1, _B) )ROC"; ASSERT_EQ(utils::Trim(target), utils::GetStreamCnt(fn)); - Module::Builder builder("module", common::DefaultHostTarget()); + Module::Builder builder("module", cinn::common::DefaultHostTarget()); builder.AddFunction(fn); - CodeGenC codegen(common::DefaultHostTarget()); + CodeGenC codegen(cinn::common::DefaultHostTarget()); codegen.SetInlineBuiltinCodes(false); LOG(INFO) << "source:\n" << codegen.Compile(builder.Build(), @@ -320,7 +320,8 @@ void TestElementwiseAddJitPrecession( auto fn = Lower("fn", stages, {A, B, C}); LOG(INFO) << "fn:\n" << fn; - Module::Builder module_builder("some_module", common::DefaultHostTarget()); + Module::Builder module_builder("some_module", + cinn::common::DefaultHostTarget()); module_builder.AddFunction(fn); auto jit = 
backends::SimpleJIT::Create(); @@ -329,17 +330,20 @@ void TestElementwiseAddJitPrecession( auto* fn_handler = reinterpret_cast(_fn_handler); // create buffer and args - auto A_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_random() - .Build(); - auto B_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_random() - .Build(); - auto C_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_zero() - .Build(); + auto A_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_random() + .Build(); + auto B_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_random() + .Build(); + auto C_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_zero() + .Build(); auto arg_pack = - common::ArgsBuilder().Add(A_buf).Add(B_buf).Add(C_buf).Build(); + cinn::common::ArgsBuilder().Add(A_buf).Add(B_buf).Add(C_buf).Build(); fn_handler(arg_pack.data(), arg_pack.size()); @@ -511,10 +515,10 @@ TEST(ShareBufferWith, basic) { LOG(INFO) << "fn:\n" << fn; - Module::Builder builder("some_module", common::DefaultHostTarget()); + Module::Builder builder("some_module", cinn::common::DefaultHostTarget()); builder.AddFunction(fn); - CodeGenC codegen(common::DefaultHostTarget()); + CodeGenC codegen(cinn::common::DefaultHostTarget()); codegen.SetInlineBuiltinCodes(false); LOG(INFO) << "\n" diff --git a/paddle/cinn/pybind/CMakeLists.txt b/paddle/cinn/pybind/CMakeLists.txt index 33dc27860f9473..ec409578930df9 100755 --- a/paddle/cinn/pybind/CMakeLists.txt +++ b/paddle/cinn/pybind/CMakeLists.txt @@ -25,7 +25,8 @@ if(WITH_CUDA) DEPS cinncore_static cinn_runtime - pybind) + pybind + common) message("cuda_nvrtc: ${CUDA_NVRTC}") target_link_libraries(core_api ${CUDA_NVRTC_LIB} ${CUDA_LIBRARIES} cuda cudnn) if(NVTX_FOUND) @@ -45,7 +46,7 @@ else() ${llvm_libs}) endif() -target_link_libraries(core_api ${MKLML_LIB} isl ginac) +target_link_libraries(core_api ${MKLML_LIB} isl ginac common) if(USE_OPENMP STREQUAL "gnu") target_link_libraries(core_api ${OpenMP_CXX_LIBRARIES}) message(STATUS "OpenMP lib: ${OpenMP_CXX_LIBRARIES}") diff --git a/paddle/cinn/pybind/bind_utils.h b/paddle/cinn/pybind/bind_utils.h index 397ee42dd4bf17..2e509ddb5d97ba 100644 --- a/paddle/cinn/pybind/bind_utils.h +++ b/paddle/cinn/pybind/bind_utils.h @@ -30,9 +30,9 @@ namespace py = pybind11; namespace cinn::pybind { -using common::CINNValue; -using common::Shared; -using common::Type; +using cinn::common::CINNValue; +using cinn::common::Shared; +using cinn::common::Type; using ir::Expr; using ir::ExprNode; diff --git a/paddle/cinn/pybind/common.cc b/paddle/cinn/pybind/common.cc index bdb4b46c848ef9..80ff3abba928da 100644 --- a/paddle/cinn/pybind/common.cc +++ b/paddle/cinn/pybind/common.cc @@ -27,11 +27,11 @@ namespace py = pybind11; namespace cinn::pybind { -using common::bfloat16; -using common::CINNValue; -using common::float16; -using common::Target; -using common::Type; +using cinn::common::bfloat16; +using cinn::common::CINNValue; +using cinn::common::float16; +using cinn::common::Target; +using cinn::common::Type; using utils::GetStreamCnt; using utils::StringFormat; @@ -41,7 +41,7 @@ void BindType(py::module *); void BindShared(py::module *); void BindCinnValue(py::module *); -void ResetGlobalNameID() { common::Context::Global().ResetNameId(); } +void ResetGlobalNameID() { cinn::common::Context::Global().ResetNameId(); } void BindTarget(py::module *m) { py::class_ target(*m, "Target"); @@ -57,9 
+57,9 @@ void BindTarget(py::module *m) { .def("defined", &Target::defined) .def("runtime_arch", &Target::runtime_arch); - m->def("DefaultHostTarget", &common::DefaultHostTarget) - .def("DefaultNVGPUTarget", &common::DefaultNVGPUTarget) - .def("DefaultTarget", &common::DefaultTarget); + m->def("DefaultHostTarget", &cinn::common::DefaultHostTarget) + .def("DefaultNVGPUTarget", &cinn::common::DefaultNVGPUTarget) + .def("DefaultTarget", &cinn::common::DefaultTarget); m->def("get_target", &cinn::runtime::CurrentTarget::GetCurrentTarget); m->def("set_target", @@ -153,68 +153,68 @@ void BindType(py::module *m) { .value("HandleHandle", Type::cpp_type_t::HandleHandle) .export_values(); - m->def("Void", &common::Void) - .def("Int", &common::Int, py::arg("bits"), py::arg("lanes") = 1) - .def("UInt", &common::UInt, py::arg("bits"), py::arg("lanes") = 1) + m->def("Void", &cinn::common::Void) + .def("Int", &cinn::common::Int, py::arg("bits"), py::arg("lanes") = 1) + .def("UInt", &cinn::common::UInt, py::arg("bits"), py::arg("lanes") = 1) .def("Float", - &common::Float, + &cinn::common::Float, py::arg("bits"), py::arg("lanes") = 1, py::arg("st") = Type::specific_type_t::None) - .def("Float16", &common::Float16, py::arg("lanes") = 1) - .def("BFloat16", &common::BFloat16, py::arg("lanes") = 1) - .def("Bool", &common::Bool, py::arg("lanes") = 1) - .def("String", &common::String); + .def("Float16", &cinn::common::Float16, py::arg("lanes") = 1) + .def("BFloat16", &cinn::common::BFloat16, py::arg("lanes") = 1) + .def("Bool", &cinn::common::Bool, py::arg("lanes") = 1) + .def("String", &cinn::common::String); m->def( "make_const", [](const Type &type, int32_t val) -> Expr { - return common::make_const(type, val); + return cinn::common::make_const(type, val); }, py::arg("type"), py::arg("val")) .def( "make_const", [](const Type &type, int64_t val) -> Expr { - return common::make_const(type, val); + return cinn::common::make_const(type, val); }, py::arg("type"), py::arg("val")) .def( "make_const", [](const Type &type, float val) -> Expr { - return common::make_const(type, val); + return cinn::common::make_const(type, val); }, py::arg("type"), py::arg("val")) .def( "make_const", [](const Type &type, double val) -> Expr { - return common::make_const(type, val); + return cinn::common::make_const(type, val); }, py::arg("type"), py::arg("val")) .def( "make_const", [](const Type &type, bool val) -> Expr { - return common::make_const(type, val); + return cinn::common::make_const(type, val); }, py::arg("type"), py::arg("val")); m->def("type_of", [](absl::string_view dtype) { - return common::Str2Type(dtype.data()); + return cinn::common::Str2Type(dtype.data()); }); } void BindShared(py::module *m) { - py::class_ ref_count(*m, "RefCount"); + py::class_ ref_count(*m, "RefCount"); ref_count.def(py::init<>()) - .def("inc", &common::RefCount::Inc) - .def("dec", &common::RefCount::Dec) - .def("is_zero", &common::RefCount::is_zero) - .def("to_string", &common::RefCount::to_string) - .def("val", &common::RefCount::val); + .def("inc", &cinn::common::RefCount::Inc) + .def("dec", &cinn::common::RefCount::Dec) + .def("is_zero", &cinn::common::RefCount::is_zero) + .def("to_string", &cinn::common::RefCount::to_string) + .def("val", &cinn::common::RefCount::val); } // TODO(wanghaipeng03) using true_type or false_type as tag disptcher losses @@ -240,8 +240,8 @@ inline void __binary_op_visitor_dispatch( } void BindCinnValue(py::module *m) { - using common::_CINNValuePack_; - using common::CINNValuePack; + using 
cinn::common::_CINNValuePack_; + using cinn::common::CINNValuePack; DefineShared<_CINNValuePack_>(m, "_CINNValuePack_"); @@ -259,7 +259,7 @@ void BindCinnValue(py::module *m) { .def("__len__", &_CINNValuePack_::size) .def("type_info", &_CINNValuePack_::type_info); - py::class_> + py::class_> cinn_value_pack_shared(*m, "CINNValuePack"); cinn_value_pack_shared.def(py::init<_CINNValuePack_ *>()) .def("__getitem__", diff --git a/paddle/cinn/pybind/framework.cc b/paddle/cinn/pybind/framework.cc index 752ac5003f43a2..fde1f7dd8eba00 100644 --- a/paddle/cinn/pybind/framework.cc +++ b/paddle/cinn/pybind/framework.cc @@ -51,15 +51,15 @@ void BindFramework(pybind11::module *m) { const std::vector &inputs, const std::vector &out_types, const std::vector> &output_shapes, - const common::Target &target) { + const cinn::common::Target &target) { const Operator *op_ptr = Operator::Get(key); auto impl = OpStrategy::SelectImpl( self[op_ptr](attrs, inputs, out_types, output_shapes, target)); - std::vector temp_inputs; + std::vector temp_inputs; std::vector res; for (auto &tensor : inputs) { res.push_back(tensor); - temp_inputs.push_back(common::CINNValue(tensor)); + temp_inputs.push_back(cinn::common::CINNValue(tensor)); } ir::LoweredFunc func; @@ -73,7 +73,7 @@ void BindFramework(pybind11::module *m) { std::vector funcs = hlir::framework::GetFuncFromImpl( impl, - common::CINNValuePack{temp_inputs}, + cinn::common::CINNValuePack{temp_inputs}, res, input_output_names, key, @@ -114,7 +114,7 @@ void BindFramework(pybind11::module *m) { .def("get_tensor", [](Scope &self, const std::string &name, const Target &target) { auto t = self.GetTensor(name); - py::dtype dt(common::Type2Str(t->type())); + py::dtype dt(cinn::common::Type2Str(t->type())); py::array::ShapeContainer shape(t->shape().data().begin(), t->shape().data().end()); py::array array(std::move(dt), std::move(shape)); @@ -140,8 +140,10 @@ void BindFramework(pybind11::module *m) { }) .def("var_names", &Scope::var_names); - py::class_>(*m, "SharedTensor"); - py::class_>(*m, "Tensor") + py::class_>(*m, + "SharedTensor"); + py::class_>(*m, + "Tensor") .def(py::init<>()) .def("shape", [](hlir::framework::Tensor &self) { return self->shape().data(); }) @@ -151,8 +153,9 @@ void BindFramework(pybind11::module *m) { }) .def( "numpy", - [](hlir::framework::Tensor &self, const common::Target &target) { - std::string type_str = common::Type2Str(self->type()); + [](hlir::framework::Tensor &self, + const cinn::common::Target &target) { + std::string type_str = cinn::common::Type2Str(self->type()); if (type_str == "bfloat16") { type_str = "uint16"; } @@ -183,8 +186,9 @@ void BindFramework(pybind11::module *m) { "from_numpy", [](hlir::framework::Tensor &self, py::array array, - const common::Target &target) { - CHECK(array.dtype().is(py::dtype(common::Type2Str(self->type())))) + const cinn::common::Target &target) { + CHECK(array.dtype().is( + py::dtype(cinn::common::Type2Str(self->type())))) << "currently only support float32 data type as input"; hlir::framework::shape_t shape; std::copy_n(array.shape(), array.ndim(), std::back_inserter(shape)); diff --git a/paddle/cinn/pybind/frontend.cc b/paddle/cinn/pybind/frontend.cc index aafa9bedf40d07..05e814ce107f80 100644 --- a/paddle/cinn/pybind/frontend.cc +++ b/paddle/cinn/pybind/frontend.cc @@ -41,7 +41,7 @@ #include "paddle/cinn/utils/timer.h" namespace cinn::pybind { -using common::Type; +using cinn::common::Type; using frontend::Placeholder; namespace py = pybind11; using namespace cinn::frontend; // NOLINT @@ -78,7 
+78,8 @@ void BindFrontend(pybind11::module *m) { .def("id", [](Variable &self) { return self->id; }) .def("name", [](Variable &self) { return self->id; }) .def("shape", [](Variable &self) { return self->shape; }) - .def("type", [](Variable &self) { return common::Type2Str(self->type); }) + .def("type", + [](Variable &self) { return cinn::common::Type2Str(self->type); }) .def("set_type", [](Variable &self, const Type &type) { self->type = type; @@ -86,7 +87,7 @@ void BindFrontend(pybind11::module *m) { }) .def("set_type", [](Variable &self, const std::string &type) { - self->type = common::Str2Type(type); + self->type = cinn::common::Str2Type(type); return self; }) .def("set_shape", [](Variable &self, const std::vector &shape) { @@ -95,15 +96,16 @@ void BindFrontend(pybind11::module *m) { }); py::class_(*m, "Placeholder") // - .def(py::init &, absl::string_view>(), py::arg("type"), py::arg("shape"), py::arg("id") = "") .def("shape", &Placeholder::shape) - .def("type", - [](Placeholder &self) { return common::Type2Str(self.type()); }) + .def( + "type", + [](Placeholder &self) { return cinn::common::Type2Str(self.type()); }) .def("id", &Placeholder::id) .def("name", &Placeholder::id) .def("__str__", [](const Placeholder &self) { return self.id(); }); @@ -179,7 +181,7 @@ void BindFrontend(pybind11::module *m) { .def( "build_and_get_output", [](Program &self, - const common::Target &target, + const cinn::common::Target &target, const std::vector &tensor_inputs, const std::vector &input_data, const std::vector &tensor_outputs, @@ -263,7 +265,7 @@ void BindFrontend(pybind11::module *m) { .def("apply_pass", [](Program &self, const std::unordered_set &fetch_ids, - const common::Target &target, + const cinn::common::Target &target, const std::vector &passes = {}) { auto graph = Optimize(&self, fetch_ids, target, passes); return graph->fusion_groups.size(); @@ -294,7 +296,7 @@ void BindFrontend(pybind11::module *m) { .def( "test_benchmark", [](Program &self, - const common::Target &target, + const cinn::common::Target &target, const std::vector &tensor_inputs, const std::vector &input_data, const Variable &tensor_out, @@ -340,7 +342,7 @@ void BindFrontend(pybind11::module *m) { .def( "test_benchmark_with_code", [](Program &self, - const common::Target &target, + const cinn::common::Target &target, const std::vector &tensor_inputs, const std::vector &input_data, const Variable &tensor_out, @@ -485,7 +487,7 @@ void BindFrontend(pybind11::module *m) { // clang-format on .def(py::init(), py::arg("name") = "") .def("create_input", - static_cast &, const std::string &)>( &NetBuilder::CreateInput), @@ -843,7 +845,7 @@ void BindFrontend(pybind11::module *m) { // used always .def_static( "build_and_compile", - [](const common::Target &target, + [](const cinn::common::Target &target, NetBuilder &builder, const CinnComputation::CompileOptions &options) { return CinnComputation::BuildAndCompile(target, builder, options); @@ -853,7 +855,7 @@ void BindFrontend(pybind11::module *m) { py::arg("options") = CinnComputation::DefaultCompileOptions()) .def_static( "compile", - [](const common::Target &target, + [](const cinn::common::Target &target, Program &program, const CinnComputation::CompileOptions &options) { return CinnComputation::Compile(target, program, options); @@ -863,7 +865,7 @@ void BindFrontend(pybind11::module *m) { py::arg("options") = CinnComputation::DefaultCompileOptions()) .def_static( "compile_paddle_model", - [](const common::Target &target, + [](const cinn::common::Target &target, const 
std::string &model_path, const std::vector &input_names, const std::vector &input_shapes, @@ -888,7 +890,7 @@ void BindFrontend(pybind11::module *m) { py::class_(*m, "PaddleModelConvertor") .def(py::init<>()) - .def(py::init, std::shared_ptr>(), py::arg("target"), diff --git a/paddle/cinn/pybind/ir/ir.cc b/paddle/cinn/pybind/ir/ir.cc index f569bd2c973bee..6118f7c8a5e695 100644 --- a/paddle/cinn/pybind/ir/ir.cc +++ b/paddle/cinn/pybind/ir/ir.cc @@ -41,7 +41,7 @@ std::vector AxisMap(const std::string& kinds, // TODO(6clc): set bound of IterVar - Var iter_var = ir::_Var_::Make("iter_tmp", common::Int(32)); + Var iter_var = ir::_Var_::Make("iter_tmp", cinn::common::Int(32)); if (c == 'S') { iter_var->is_reduce_axis = false; } else if (c == 'R') { @@ -89,7 +89,7 @@ IRContext Sequential(Expr min, Expr extent) { ForContextNode* for_ctx_node = new ForContextNode(); for_ctx_node->min = min; for_ctx_node->extent = extent; - for_ctx_node->loop_var = ir::_Var_::Make("v", common::Int(32)); + for_ctx_node->loop_var = ir::_Var_::Make("v", cinn::common::Int(32)); return IRContext(for_ctx_node); } diff --git a/paddle/cinn/pybind/ir/ir_api.cc b/paddle/cinn/pybind/ir/ir_api.cc index 9d8320c31c7adf..b2e625e741ba62 100644 --- a/paddle/cinn/pybind/ir/ir_api.cc +++ b/paddle/cinn/pybind/ir/ir_api.cc @@ -135,8 +135,8 @@ void BindNode(py::module *m) { DefineShared(m, "IrNode"); // class IrNodeRef : public Shared - py::class_> ir_node_ref(*m, - "IrNodeRef"); + py::class_> ir_node_ref( + *m, "IrNodeRef"); ir_node_ref.def(py::init<>()) .def(py::init()) .def(py::init()) @@ -477,9 +477,9 @@ void BindIrIr(py::module *m) { py::class_ var(*m, "Var"); var.def(py::init<>()) .def(py::init()) - .def(py::init(), + .def(py::init(), arg("name_hint"), - arg("t") = common::type_of()) + arg("t") = cinn::common::type_of()) .def(py::init()) .def(py::init()) .def(py::init()) @@ -734,7 +734,7 @@ void BindIrTensor(py::module *m) { auto PackedFuncCall(lang::PackedFunc &self, py::args args) { // NOLINT lang::Args cinn_args; - using common::CINNValue; + using cinn::common::CINNValue; for (auto handle : args) { if (py::isinstance(handle)) { cinn_args.Append(CINNValue(py::cast(handle))); @@ -766,7 +766,9 @@ void BindPackedFunc(py::module *m) { [](lang::Args &self, int i) { return self[i]; }, py::return_value_policy::reference) .def("__setitem__", - [](lang::Args &self, int i, common::CINNValue &v) { self[i] = v; }); + [](lang::Args &self, int i, cinn::common::CINNValue &v) { + self[i] = v; + }); py::class_ packed_func(*m, "PackedFunc"); packed_func.def(py::init<>()) diff --git a/paddle/cinn/pybind/ir/ir_context.cc b/paddle/cinn/pybind/ir/ir_context.cc index 8af89d974222f1..8b4d0a4cf1e1d3 100644 --- a/paddle/cinn/pybind/ir/ir_context.cc +++ b/paddle/cinn/pybind/ir/ir_context.cc @@ -92,7 +92,7 @@ void IRBuilderNode::Reset() { } IRBuilder::IRBuilder() { - common::Shared n(new IRBuilderNode()); + cinn::common::Shared n(new IRBuilderNode()); n->Reset(); data_ = n; } diff --git a/paddle/cinn/pybind/ir/ir_context.h b/paddle/cinn/pybind/ir/ir_context.h index 89b65512e26664..8cdf0ed85c0818 100644 --- a/paddle/cinn/pybind/ir/ir_context.h +++ b/paddle/cinn/pybind/ir/ir_context.h @@ -29,7 +29,7 @@ namespace pybind { /** * A base context that represents the CINN IR that need context information */ -class IRContextNode : public common::Object { +class IRContextNode : public cinn::common::Object { public: std::vector exprs; @@ -60,7 +60,7 @@ class IRContext { void add_expr(Expr expr) { data_->exprs.push_back(expr); } public: - common::Shared data_; + 
cinn::common::Shared data_; public: template @@ -196,7 +196,7 @@ class ElseContextNode : public IRContextNode { /** * A stack used to store current IRContext */ -class IRBuilderNode : public common::Object { +class IRBuilderNode : public cinn::common::Object { public: std::vector contexts; Expr result; @@ -226,7 +226,7 @@ class IRBuilder { static IRBuilder CurrentIRBuilder(); public: - common::Shared data_; + cinn::common::Shared data_; }; std::vector* IRBuilderStack(); diff --git a/paddle/cinn/pybind/lang.cc b/paddle/cinn/pybind/lang.cc index 8e121fc5628c93..5f7a80e12e2c06 100644 --- a/paddle/cinn/pybind/lang.cc +++ b/paddle/cinn/pybind/lang.cc @@ -35,7 +35,7 @@ namespace py = pybind11; namespace cinn::pybind { -using common::Type; +using cinn::common::Type; using lang::Placeholder; using py::arg; using utils::GetStreamCnt; @@ -70,7 +70,7 @@ void BindLower(py::module *m) { arg("scalar_args") = std::vector(), arg("temp_tensors") = std::vector(), arg("b") = nullptr, - arg("target") = common::DefaultHostTarget(), + arg("target") = cinn::common::DefaultHostTarget(), arg("supprt_ir_schedule") = false); } @@ -84,7 +84,7 @@ void BindLowerVec(py::module *m) { arg("scalar_args") = std::vector(), arg("temp_tensors") = std::vector(), arg("b") = nullptr, - arg("target") = common::DefaultHostTarget(), + arg("target") = cinn::common::DefaultHostTarget(), arg("supprt_ir_schedule") = false); } @@ -144,13 +144,13 @@ void BindModule(py::module *m) { .def("submodules", &ir::Module::submodules) .def("compile", &ir::Module::Compile) .def("get_c_code", [](const ir::Module &self) -> std::string { - backends::CodeGenC codegen(common::DefaultHostTarget()); + backends::CodeGenC codegen(cinn::common::DefaultHostTarget()); codegen.SetInlineBuiltinCodes(false); return codegen.Compile(self, backends::CodeGenC::OutputKind::CImpl); }); py::class_ builder(module, "Builder"); - builder.def(py::init()) + builder.def(py::init()) .def("add_function", [](ir::Module::Builder &self, ir::LoweredFunc func) { if (self.GetTargetArch() == Target::Arch::NVGPU) { diff --git a/paddle/cinn/pybind/pe.cc b/paddle/cinn/pybind/pe.cc index 94204ae4b3e44c..2cd837ab2da3f4 100644 --- a/paddle/cinn/pybind/pe.cc +++ b/paddle/cinn/pybind/pe.cc @@ -26,7 +26,7 @@ namespace py = pybind11; namespace cinn { namespace pybind { -using common::Type; +using cinn::common::Type; using lang::Placeholder; using py::arg; using utils::GetStreamCnt; @@ -137,7 +137,7 @@ void BindPE(py::module* m) { py::arg("trans_b") = false, py::arg("alpha") = 1, py::arg("out") = "T_Matmul_mkl_out", - py::arg("target") = common::DefaultHostTarget()); + py::arg("target") = cinn::common::DefaultHostTarget()); } } // namespace pybind diff --git a/paddle/cinn/pybind/runtime.cc b/paddle/cinn/pybind/runtime.cc index a4d14edc709316..91db8af397ec29 100644 --- a/paddle/cinn/pybind/runtime.cc +++ b/paddle/cinn/pybind/runtime.cc @@ -76,9 +76,9 @@ cinn_buffer_t *CreateBufferFromNumpy(py::array data, cinn_buffer_t *CreateBufferFromNumpy( py::array data, - common::Target target = common::DefaultHostTarget(), + cinn::common::Target target = cinn::common::DefaultHostTarget(), int align = 0) { - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { return CreateBufferFromNumpy(data, cinn_x86_device); } else if (target.arch == Target::Arch::NVGPU) { #ifdef CINN_WITH_CUDA @@ -276,7 +276,7 @@ void BindCinnRuntime(py::module *m) { arg("data"), arg("device"), arg("align") = 0) - .def(py::init(py::overload_cast( + .def(py::init(py::overload_cast( 
&CreateBufferFromNumpy)), arg("data"), arg("target"), diff --git a/paddle/cinn/runtime/cpu/cblas.cc b/paddle/cinn/runtime/cpu/cblas.cc index 8a9f7be63083cc..9e08c128cb66b0 100644 --- a/paddle/cinn/runtime/cpu/cblas.cc +++ b/paddle/cinn/runtime/cpu/cblas.cc @@ -137,14 +137,14 @@ void cinn_call_cholesky_host( CINN_REGISTER_HELPER(cinn_cpu_mkl) { using namespace cinn; // NOLINT using backends::FunctionProto; - auto host_target = common::DefaultHostTarget(); + auto host_target = cinn::common::DefaultHostTarget(); FunctionProto::shape_inference_t inference_shape_gemm = [](const std::vector& args, int offset) { CHECK_EQ(offset, 0UL) << "Only one output"; CHECK_EQ(args.size(), 12UL) << "Wrong number of arguments passed in"; - auto M = common::AutoSimplify(args[1]); - auto N = common::AutoSimplify(args[2]); + auto M = cinn::common::AutoSimplify(args[1]); + auto N = cinn::common::AutoSimplify(args[2]); std::vector shape; shape.push_back(M); shape.push_back(N); @@ -159,16 +159,16 @@ CINN_REGISTER_HELPER(cinn_cpu_mkl) { auto A_tensor = A.as_tensor(); CHECK(A_tensor); - auto batch_size = common::AutoSimplify(args[1]); + auto batch_size = cinn::common::AutoSimplify(args[1]); int32_t batch_size_val = batch_size.as_int32(); - auto M = common::AutoSimplify(args[2]); - auto N = common::AutoSimplify(args[3]); + auto M = cinn::common::AutoSimplify(args[2]); + auto N = cinn::common::AutoSimplify(args[3]); std::vector shape; int total = 1; for (auto& v : A_tensor->shape) { - auto val = common::AutoSimplify(v); + auto val = cinn::common::AutoSimplify(v); CHECK(val.is_constant()); shape.push_back(val); total *= val.as_int32(); diff --git a/paddle/cinn/runtime/cpu/host_intrinsics_test.cc b/paddle/cinn/runtime/cpu/host_intrinsics_test.cc index 22e13f8b0c3abf..f7f5ba7cb085d2 100644 --- a/paddle/cinn/runtime/cpu/host_intrinsics_test.cc +++ b/paddle/cinn/runtime/cpu/host_intrinsics_test.cc @@ -41,7 +41,7 @@ TEST(tanh, basic) { auto jit = backends::SimpleJIT::Create(); - ir::Module::Builder builder("module1", common::DefaultHostTarget()); + ir::Module::Builder builder("module1", cinn::common::DefaultHostTarget()); auto fn = Lower("fn", stages, {x, y}); LOG(INFO) << "fn:\n" << fn; @@ -54,13 +54,15 @@ TEST(tanh, basic) { auto fnp = reinterpret_cast(fn_ptr); ASSERT_TRUE(fnp); - auto* x_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_random() - .Build(); - auto* out_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_zero() - .Build(); - auto args = common::ArgsBuilder().Add(x_buf).Add(out_buf).Build(); + auto* x_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_random() + .Build(); + auto* out_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_zero() + .Build(); + auto args = cinn::common::ArgsBuilder().Add(x_buf).Add(out_buf).Build(); fnp(args.data(), args.size()); auto* x_buf_data = reinterpret_cast(x_buf->memory); @@ -87,7 +89,7 @@ TEST(find_value_nd, basic) { auto jit = backends::SimpleJIT::Create(); - ir::Module::Builder builder("module1", common::DefaultHostTarget()); + ir::Module::Builder builder("module1", cinn::common::DefaultHostTarget()); auto fn = Lower("fn", stages, {x, y}); LOG(INFO) << "fn:\n" << fn; @@ -100,12 +102,13 @@ TEST(find_value_nd, basic) { auto fnp = reinterpret_cast(fn_ptr); ASSERT_TRUE(fnp); - auto* x_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_random() - .Build(); + auto* x_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), 
N.as_int32()}) + .set_random() + .Build(); auto* out_buf = - common::BufferBuilder(Int(32), {N.as_int32()}).set_zero().Build(); - auto args = common::ArgsBuilder().Add(x_buf).Add(out_buf).Build(); + cinn::common::BufferBuilder(Int(32), {N.as_int32()}).set_zero().Build(); + auto args = cinn::common::ArgsBuilder().Add(x_buf).Add(out_buf).Build(); fnp(args.data(), args.size()); auto* x_buf_data = reinterpret_cast(x_buf->memory); @@ -135,7 +138,7 @@ TEST(cinn_host_lt_num_fp32, basic) { auto jit = backends::SimpleJIT::Create(); - ir::Module::Builder builder("module1", common::DefaultHostTarget()); + ir::Module::Builder builder("module1", cinn::common::DefaultHostTarget()); auto fn = Lower("fn", stages, {x, y}); LOG(INFO) << "fn:\n" << fn; @@ -148,12 +151,13 @@ TEST(cinn_host_lt_num_fp32, basic) { auto fnp = reinterpret_cast(fn_ptr); ASSERT_TRUE(fnp); - auto* x_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_random() - .Build(); + auto* x_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_random() + .Build(); auto* out_buf = - common::BufferBuilder(Int(32), {N.as_int32()}).set_zero().Build(); - auto args = common::ArgsBuilder().Add(x_buf).Add(out_buf).Build(); + cinn::common::BufferBuilder(Int(32), {N.as_int32()}).set_zero().Build(); + auto args = cinn::common::ArgsBuilder().Add(x_buf).Add(out_buf).Build(); fnp(args.data(), args.size()); auto* x_buf_data = reinterpret_cast(x_buf->memory); @@ -186,7 +190,7 @@ TEST(cinn_host_gt_num_fp32, basic) { auto jit = backends::SimpleJIT::Create(); - ir::Module::Builder builder("module1", common::DefaultHostTarget()); + ir::Module::Builder builder("module1", cinn::common::DefaultHostTarget()); auto fn = Lower("fn", stages, {x, y}); LOG(INFO) << "fn:\n" << fn; @@ -199,12 +203,13 @@ TEST(cinn_host_gt_num_fp32, basic) { auto fnp = reinterpret_cast(fn_ptr); ASSERT_TRUE(fnp); - auto* x_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_random() - .Build(); + auto* x_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_random() + .Build(); auto* out_buf = - common::BufferBuilder(Int(32), {N.as_int32()}).set_zero().Build(); - auto args = common::ArgsBuilder().Add(x_buf).Add(out_buf).Build(); + cinn::common::BufferBuilder(Int(32), {N.as_int32()}).set_zero().Build(); + auto args = cinn::common::ArgsBuilder().Add(x_buf).Add(out_buf).Build(); fnp(args.data(), args.size()); auto* x_buf_data = reinterpret_cast(x_buf->memory); diff --git a/paddle/cinn/runtime/cpu/mkl_math_test.cc b/paddle/cinn/runtime/cpu/mkl_math_test.cc index f91a76ddd54114..d064535d940c18 100644 --- a/paddle/cinn/runtime/cpu/mkl_math_test.cc +++ b/paddle/cinn/runtime/cpu/mkl_math_test.cc @@ -33,11 +33,13 @@ cinn_buffer_t *CreateBuffer(const std::vector shape, bool random = true, int set_value = 0) { if (random) { - return common::BufferBuilder(Float(32), shape).set_random().Build(); + return cinn::common::BufferBuilder(Float(32), shape).set_random().Build(); } else if (set_value != 0) { - return common::BufferBuilder(Float(32), shape).set_val(set_value).Build(); + return cinn::common::BufferBuilder(Float(32), shape) + .set_val(set_value) + .Build(); } - return common::BufferBuilder(Float(32), shape).set_zero().Build(); + return cinn::common::BufferBuilder(Float(32), shape).set_zero().Build(); } template @@ -74,7 +76,7 @@ void TestCallElementwise(const std::string &fn_name, auto stages = CreateStages(lower_args); - auto target = common::DefaultHostTarget(); + auto target = 
cinn::common::DefaultHostTarget(); target.arch = Target::Arch::X86; ir::Module::Builder builder("module0", target); auto func = Lower("fn", stages, lower_args); @@ -96,8 +98,9 @@ void TestCallElementwise(const std::string &fn_name, } else { A_buf = CreateBuffer({10, 10}); } - auto *B_buf = - common::BufferBuilder(type, {10, 10}).set_align(type.bits()).Build(); + auto *B_buf = cinn::common::BufferBuilder(type, {10, 10}) + .set_align(type.bits()) + .Build(); cinn_pod_value_t a_arg(A_buf), b_arg(B_buf); cinn_pod_value_t args[] = {a_arg, b_arg}; @@ -183,18 +186,18 @@ TEST(cinn_cpu_mkl_gemm_fp32, test) { [=]() -> Expr { return lang::CallExtern("cinn_cpu_mkl_gemm_fp32", { - common::make_one(), // alpha - M, // M - N, // N - K, // K - common::make_bool(false), // ta - common::make_bool(false), // tb - K, // lda - N, // ldb - N, // ldc - common::make_zero(), // beta - A.tensor(), // A - B.tensor(), // B + cinn::common::make_one(), // alpha + M, // M + N, // N + K, // K + cinn::common::make_bool(false), // ta + cinn::common::make_bool(false), // tb + K, // lda + N, // ldb + N, // ldc + cinn::common::make_zero(), // beta + A.tensor(), // A + B.tensor(), // B }); }, "extern_call"); @@ -204,7 +207,7 @@ TEST(cinn_cpu_mkl_gemm_fp32, test) { auto stages = CreateStages({call, out}); - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); target.arch = Target::Arch::X86; ir::Module::Builder builder("module0", target); @@ -221,17 +224,21 @@ TEST(cinn_cpu_mkl_gemm_fp32, test) { auto fn_ptr = reinterpret_cast(fn); // test with real data - auto *A_buf = common::BufferBuilder(Float(32), {M.as_int32(), K.as_int32()}) - .set_random() - .Build(); - auto *B_buf = common::BufferBuilder(Float(32), {K.as_int32(), N.as_int32()}) - .set_random() - .Build(); - auto *C_buf = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) - .set_zero() - .Build(); - - auto args = common::ArgsBuilder().Add(A_buf).Add(B_buf).Add(C_buf).Build(); + auto *A_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), K.as_int32()}) + .set_random() + .Build(); + auto *B_buf = + cinn::common::BufferBuilder(Float(32), {K.as_int32(), N.as_int32()}) + .set_random() + .Build(); + auto *C_buf = + cinn::common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}) + .set_zero() + .Build(); + + auto args = + cinn::common::ArgsBuilder().Add(A_buf).Add(B_buf).Add(C_buf).Build(); fn_ptr(args.data(), args.size()); diff --git a/paddle/cinn/runtime/cpu/mkldnn_math.cc b/paddle/cinn/runtime/cpu/mkldnn_math.cc index 8154ee538ad739..b45ddedd2e890a 100644 --- a/paddle/cinn/runtime/cpu/mkldnn_math.cc +++ b/paddle/cinn/runtime/cpu/mkldnn_math.cc @@ -157,23 +157,23 @@ void cinn_cpu_mkldnn_conv2d_nchw_fp32(int batch_size, CINN_REGISTER_HELPER(cinn_cpu_mkldnn) { using namespace cinn; // NOLINT using backends::FunctionProto; - auto host_target = common::DefaultHostTarget(); + auto host_target = cinn::common::DefaultHostTarget(); FunctionProto::shape_inference_t inference_shape_conv2d_nchw = [](const std::vector& args, int offset) { CHECK_EQ(args.size(), 16UL) << "Wrong number of arguments passed in"; - auto N = common::AutoSimplify(args[0]); - int input_h = common::AutoSimplify(args[2]).as_int32(); - int input_w = common::AutoSimplify(args[3]).as_int32(); - auto c_out = common::AutoSimplify(args[4]); - int filter_h = common::AutoSimplify(args[6]).as_int32(); - int filter_w = common::AutoSimplify(args[7]).as_int32(); - int pad_h = common::AutoSimplify(args[8]).as_int32(); - int pad_w = 
common::AutoSimplify(args[9]).as_int32(); - int stride_h = common::AutoSimplify(args[10]).as_int32(); - int stride_w = common::AutoSimplify(args[11]).as_int32(); - int dilation_h = common::AutoSimplify(args[12]).as_int32(); - int dilation_w = common::AutoSimplify(args[13]).as_int32(); + auto N = cinn::common::AutoSimplify(args[0]); + int input_h = cinn::common::AutoSimplify(args[2]).as_int32(); + int input_w = cinn::common::AutoSimplify(args[3]).as_int32(); + auto c_out = cinn::common::AutoSimplify(args[4]); + int filter_h = cinn::common::AutoSimplify(args[6]).as_int32(); + int filter_w = cinn::common::AutoSimplify(args[7]).as_int32(); + int pad_h = cinn::common::AutoSimplify(args[8]).as_int32(); + int pad_w = cinn::common::AutoSimplify(args[9]).as_int32(); + int stride_h = cinn::common::AutoSimplify(args[10]).as_int32(); + int stride_w = cinn::common::AutoSimplify(args[11]).as_int32(); + int dilation_h = cinn::common::AutoSimplify(args[12]).as_int32(); + int dilation_w = cinn::common::AutoSimplify(args[13]).as_int32(); int out_h = (input_h - ((filter_h - 1) * dilation_h + 1) + 2 * pad_h) / stride_h + 1; diff --git a/paddle/cinn/runtime/cpu/mkldnn_math_test.cc b/paddle/cinn/runtime/cpu/mkldnn_math_test.cc index 26d06d715d5504..15574a90280427 100644 --- a/paddle/cinn/runtime/cpu/mkldnn_math_test.cc +++ b/paddle/cinn/runtime/cpu/mkldnn_math_test.cc @@ -33,11 +33,13 @@ cinn_buffer_t *CreateBuffer(const std::vector shape, bool random = true, int set_value = 0) { if (random) { - return common::BufferBuilder(Float(32), shape).set_random().Build(); + return cinn::common::BufferBuilder(Float(32), shape).set_random().Build(); } else if (set_value != 0) { - return common::BufferBuilder(Float(32), shape).set_val(set_value).Build(); + return cinn::common::BufferBuilder(Float(32), shape) + .set_val(set_value) + .Build(); } - return common::BufferBuilder(Float(32), shape).set_zero().Build(); + return cinn::common::BufferBuilder(Float(32), shape).set_zero().Build(); } TEST(cinn_cpu_mkldnn_conv2d_nchw_fp32, test) { @@ -90,7 +92,7 @@ TEST(cinn_cpu_mkldnn_conv2d_nchw_fp32, test) { auto stages = CreateStages({call, out}); - auto target = common::DefaultHostTarget(); + auto target = cinn::common::DefaultHostTarget(); target.arch = Target::Arch::X86; ir::Module::Builder builder("module0", target); @@ -109,16 +111,18 @@ TEST(cinn_cpu_mkldnn_conv2d_nchw_fp32, test) { // test with real data int o_h = (i_h - ((k_h - 1) * dilation_h + 1) + pad_h * 2) / stride_h + 1; int o_w = (i_w - ((k_w - 1) * dilation_w + 1) + pad_w * 2) / stride_w + 1; - auto *A_buf = common::BufferBuilder(Float(32), {n, c_in, i_h, i_w}) + auto *A_buf = cinn::common::BufferBuilder(Float(32), {n, c_in, i_h, i_w}) .set_random() .Build(); - auto *B_buf = common::BufferBuilder(Float(32), {c_out, c_in, k_h, k_w}) + auto *B_buf = cinn::common::BufferBuilder(Float(32), {c_out, c_in, k_h, k_w}) .set_random() .Build(); - auto *C_buf = - common::BufferBuilder(Float(32), {n, c_out, o_h, o_w}).set_zero().Build(); + auto *C_buf = cinn::common::BufferBuilder(Float(32), {n, c_out, o_h, o_w}) + .set_zero() + .Build(); - auto args = common::ArgsBuilder().Add(A_buf).Add(B_buf).Add(C_buf).Build(); + auto args = + cinn::common::ArgsBuilder().Add(A_buf).Add(B_buf).Add(C_buf).Build(); fn_ptr(args.data(), args.size()); diff --git a/paddle/cinn/runtime/cpu/thread_backend.cc b/paddle/cinn/runtime/cpu/thread_backend.cc index c6c49dfe5d5052..43804e33b1e60b 100644 --- a/paddle/cinn/runtime/cpu/thread_backend.cc +++ b/paddle/cinn/runtime/cpu/thread_backend.cc @@ -64,7 
+64,7 @@ int cinn_backend_parallel_launch(FCINNParallelLambda flambda, CINN_REGISTER_HELPER(cinn_backend_parallel) { using namespace cinn; // NOLINT using backends::FunctionProto; - auto host_target = common::DefaultHostTarget(); + auto host_target = cinn::common::DefaultHostTarget(); backends::GlobalSymbolRegistry::Global().RegisterFn( runtime::intrinsic::parallel_launch, reinterpret_cast(&cinn_backend_parallel_launch)); diff --git a/paddle/cinn/runtime/cuda/cublas_util.h b/paddle/cinn/runtime/cuda/cublas_util.h index edb3d60e8a1a3c..bdd21dafed544f 100644 --- a/paddle/cinn/runtime/cuda/cublas_util.h +++ b/paddle/cinn/runtime/cuda/cublas_util.h @@ -91,8 +91,8 @@ inline cublasStatus_t cublasGemm(cudaDataType_t dtype, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); #else - common::float16 alpha_fp16{alpha}; - common::float16 beta_fp16{beta}; + cinn::common::float16 alpha_fp16{alpha}; + cinn::common::float16 beta_fp16{beta}; return cublasHgemm(handle, transa, transb, @@ -221,8 +221,8 @@ inline cublasStatus_t cublasGemmStridedBatched(cudaDataType_t dtype, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); #else - common::float16 alpha_fp16{alpha}; - common::float16 beta_fp16{beta}; + cinn::common::float16 alpha_fp16{alpha}; + cinn::common::float16 beta_fp16{beta}; return cublasHgemmStridedBatched( handle, transa, diff --git a/paddle/cinn/runtime/cuda/cuda_module_test.cc b/paddle/cinn/runtime/cuda/cuda_module_test.cc index 2b0ff0c89c72a1..fe41a1ed0ca2e0 100644 --- a/paddle/cinn/runtime/cuda/cuda_module_test.cc +++ b/paddle/cinn/runtime/cuda/cuda_module_test.cc @@ -51,7 +51,7 @@ void saxpy(float a, float *x, float *y, float *out, size_t n) } TEST(CUDAModule, float16) { - using common::float16; + using cinn::common::float16; using runtime::cuda::util::Vector; auto generate_ptx = [] { @@ -120,7 +120,7 @@ TEST(CUDAModule, float16) { } TEST(CUDAModule, bfloat16) { - using common::bfloat16; + using cinn::common::bfloat16; using runtime::cuda::util::Vector; auto generate_ptx = [] { diff --git a/paddle/cinn/runtime/cuda/cuda_util.cc b/paddle/cinn/runtime/cuda/cuda_util.cc index 6509fb33dbeb49..326e5a3aac561d 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.cc +++ b/paddle/cinn/runtime/cuda/cuda_util.cc @@ -191,7 +191,7 @@ void cinn_call_cublas(void *v_args, bool is_float = type_code == cinn_type_float; bool is_bfloat16 = type_code == cinn_type_bfloat; int bytes = args[0].operator cinn_buffer_t *()->type.bits / CHAR_BIT; - if (is_float && bytes == sizeof(common::float16)) { + if (is_float && bytes == sizeof(cinn::common::float16)) { cuda_dtype = CUDA_R_16F; } else if (is_float && bytes == sizeof(float)) { cuda_dtype = CUDA_R_32F; @@ -413,7 +413,7 @@ void cinn_call_batched_cublas(void *v_args, bool is_float = type_code == cinn_type_float; bool is_bfloat16 = type_code == cinn_type_bfloat; int bytes = args[0].operator cinn_buffer_t *()->type.bits / CHAR_BIT; - if (is_float && bytes == sizeof(common::float16)) { + if (is_float && bytes == sizeof(cinn::common::float16)) { cuda_dtype = CUDA_R_16F; } else if (is_float && bytes == sizeof(float)) { cuda_dtype = CUDA_R_32F; @@ -1841,7 +1841,7 @@ void cinn_assert_true_nvgpu( msg, only_warning, stream, - common::DefaultNVGPUTarget()); + cinn::common::DefaultNVGPUTarget()); } void cinn_gpu_cublas_mul(const std::vector &attrs, @@ -2172,11 +2172,11 @@ void cinn_gpu_cudnn_conv2d(const absl::flat_hash_map &attr, cinn_buffer_t *w, cinn_buffer_t *y, cudaStream_t stream, - common::Layout target) { + cinn::common::Layout target) { cudnnTensorFormat_t 
cudnn_tensor_format; - if (target == common::Layout::kNCHW) { + if (target == cinn::common::Layout::kNCHW) { cudnn_tensor_format = CUDNN_TENSOR_NCHW; - } else if (target == common::Layout::kNHWC) { + } else if (target == cinn::common::Layout::kNHWC) { cudnn_tensor_format = CUDNN_TENSOR_NHWC; } else { CINN_NOT_IMPLEMENTED diff --git a/paddle/cinn/runtime/cuda/cuda_util.h b/paddle/cinn/runtime/cuda/cuda_util.h index ec7f2ca6a88352..7ea9dbe00a2c5b 100644 --- a/paddle/cinn/runtime/cuda/cuda_util.h +++ b/paddle/cinn/runtime/cuda/cuda_util.h @@ -150,12 +150,13 @@ void cinn_call_batched_cublas(void* v_args, void* stream); #ifdef CINN_WITH_CUDNN -void cinn_gpu_cudnn_conv2d(const absl::flat_hash_map& attr, - cinn_buffer_t* x, - cinn_buffer_t* w, - cinn_buffer_t* y, - cudaStream_t stream = nullptr, - common::Layout target = common::Layout::kNCHW); +void cinn_gpu_cudnn_conv2d( + const absl::flat_hash_map& attr, + cinn_buffer_t* x, + cinn_buffer_t* w, + cinn_buffer_t* y, + cudaStream_t stream = nullptr, + cinn::common::Layout target = cinn::common::Layout::kNCHW); void cinn_gpu_cudnn_conv2d_backward_data( const absl::flat_hash_map& attr, diff --git a/paddle/cinn/runtime/custom_function.cc b/paddle/cinn/runtime/custom_function.cc index dda7430b9e336f..08fe5c1bd7f351 100644 --- a/paddle/cinn/runtime/custom_function.cc +++ b/paddle/cinn/runtime/custom_function.cc @@ -27,7 +27,7 @@ PD_DECLARE_string(cinn_check_fusion_accuracy_pass); namespace cinn { namespace runtime { -using common::Target; +using cinn::common::Target; using hlir::framework::Shape; using hlir::framework::Tensor; @@ -104,7 +104,7 @@ bool MemcpyToHost(void* dst, size_t bytes, const Target& input_target, void* stream = nullptr) { - if (input_target == common::DefaultNVGPUTarget()) { + if (input_target == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA const auto& cuda_stream = static_cast(stream); cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, cuda_stream); @@ -116,7 +116,7 @@ bool MemcpyToHost(void* dst, return false; #endif } - if (input_target == common::DefaultHostTarget()) { + if (input_target == cinn::common::DefaultHostTarget()) { memcpy(dst, src, bytes); return true; } @@ -132,14 +132,14 @@ bool MemcpyToDevice(void* dst, const Target& input_target, void* stream = nullptr) { #ifdef CINN_WITH_CUDA - if (input_target == common::DefaultNVGPUTarget()) { + if (input_target == cinn::common::DefaultNVGPUTarget()) { cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, static_cast(stream)); return true; - } else if (input_target == common::DefaultHostTarget()) { + } else if (input_target == cinn::common::DefaultHostTarget()) { cudaMemcpyAsync(dst, src, bytes, @@ -223,7 +223,7 @@ void cinn_assert_true(void* v_args, Tensor cpu_tensor; cpu_tensor->Resize(Shape(shape)); - bool* dst = cpu_tensor->mutable_data(common::DefaultHostTarget()); + bool* dst = cpu_tensor->mutable_data(cinn::common::DefaultHostTarget()); // copy data from gpu to cpu const bool* src = reinterpret_cast(x->memory); @@ -236,7 +236,7 @@ void cinn_assert_true(void* v_args, utils::AssertTrueMsgTool::GetInstance()->GetMsg(msg), target); - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { utils::MemcpyToDevice( output->memory, x->memory, numel * sizeof(bool), target, stream); } else { diff --git a/paddle/cinn/runtime/custom_function_test.cc b/paddle/cinn/runtime/custom_function_test.cc index df88a0e4b817b5..546599f252cc2f 100644 --- a/paddle/cinn/runtime/custom_function_test.cc +++ 
b/paddle/cinn/runtime/custom_function_test.cc @@ -45,7 +45,7 @@ class CinnBufferAllocHelper { template T* mutable_data(const Target& target) { - if (target_ != common::UnkTarget()) { + if (target_ != cinn::common::UnkTarget()) { CHECK_EQ(target, target_) << "Cannot alloc twice, the memory had alloced at " << target_ << "! Please check."; @@ -53,9 +53,9 @@ class CinnBufferAllocHelper { } target_ = target; - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { cinn_buffer_malloc(nullptr, buffer_); - } else if (target == common::DefaultNVGPUTarget()) { + } else if (target == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA cudaMalloc(&buffer_->memory, buffer_->num_elements() * sizeof(T)); #else @@ -72,7 +72,7 @@ class CinnBufferAllocHelper { template const T* data() { - if (target_ == common::UnkTarget()) { + if (target_ == cinn::common::UnkTarget()) { LOG(FATAL) << "No memory had alloced! Please check."; } return reinterpret_cast(buffer_->memory); @@ -80,11 +80,11 @@ class CinnBufferAllocHelper { ~CinnBufferAllocHelper() { if (buffer_) { - if (target_ == common::UnkTarget()) { + if (target_ == cinn::common::UnkTarget()) { // pass - } else if (target_ == common::DefaultHostTarget()) { + } else if (target_ == cinn::common::DefaultHostTarget()) { cinn_buffer_free(nullptr, buffer_); - } else if (target_ == common::DefaultNVGPUTarget()) { + } else if (target_ == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA cudaFree(buffer_->memory); #else @@ -105,7 +105,7 @@ class CinnBufferAllocHelper { private: cinn_buffer_t* buffer_{nullptr}; - Target target_{common::UnkTarget()}; + Target target_{cinn::common::UnkTarget()}; }; template @@ -113,11 +113,11 @@ void SetInputValue(T* input, const T* input_h, size_t num, const Target& target) { - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { for (int i = 0; i < num; ++i) { input[i] = input_h[i]; } - } else if (target == common::DefaultNVGPUTarget()) { + } else if (target == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA cudaMemcpy(input, input_h, num * sizeof(T), cudaMemcpyHostToDevice); #else @@ -128,7 +128,7 @@ void SetInputValue(T* input, } TEST(CinnAssertTrue, test_true) { - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); CinnBufferAllocHelper x(cinn_x86_device, cinn_bool_t(), {1}); @@ -151,10 +151,10 @@ TEST(CinnAssertTrue, test_true) { cinn::runtime::utils::AssertTrueMsgTool::GetInstance()->SetMsg(msg_key, msg); cinn_assert_true(v_args, 2, msg_key, true, nullptr, target); - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { ASSERT_EQ(input[0], output[0]) << "The output of AssertTrue should be the same as input"; - } else if (target == common::DefaultNVGPUTarget()) { + } else if (target == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA bool output_h = false; cudaMemcpy(&output_h, output, sizeof(bool), cudaMemcpyDeviceToHost); @@ -166,7 +166,7 @@ TEST(CinnAssertTrue, test_true) { } TEST(CinnAssertTrue, test_false_only_warning) { - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); CinnBufferAllocHelper x(cinn_x86_device, cinn_bool_t(), {1}); @@ -189,10 +189,10 @@ TEST(CinnAssertTrue, test_false_only_warning) { cinn::runtime::utils::AssertTrueMsgTool::GetInstance()->SetMsg(msg_key, msg); cinn_assert_true(v_args, 2, msg_key, true, nullptr, target); - if (target == 
common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { ASSERT_EQ(input[0], output[0]) << "The output of AssertTrue should be the same as input"; - } else if (target == common::DefaultNVGPUTarget()) { + } else if (target == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA bool output_h = false; cudaMemcpy(&output_h, output, sizeof(bool), cudaMemcpyDeviceToHost); @@ -204,7 +204,7 @@ TEST(CinnAssertTrue, test_false_only_warning) { } TEST(CustomCallGaussianRandom, test_target_nvgpu) { - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); // Arg mean float mean = 0.0f; @@ -220,9 +220,9 @@ TEST(CustomCallGaussianRandom, test_target_nvgpu) { int num_args = 1; cinn_pod_value_t v_args[1] = {cinn_pod_value_t(out.get())}; - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { LOG(INFO) << "Op gaussian random only support on NVGPU"; - } else if (target == common::DefaultNVGPUTarget()) { + } else if (target == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA cinn::runtime::cuda::cinn_call_gaussian_random( v_args, num_args, mean, std, seed, nullptr); @@ -240,7 +240,7 @@ TEST(CustomCallGaussianRandom, test_target_nvgpu) { } TEST(CustomCallUniformRandom, test_target_nvgpu) { - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); // Arg min float min = -1.0f; @@ -256,9 +256,9 @@ TEST(CustomCallUniformRandom, test_target_nvgpu) { int num_args = 1; cinn_pod_value_t v_args[1] = {cinn_pod_value_t(out.get())}; - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { LOG(INFO) << "Op uniform random only support on NVGPU"; - } else if (target == common::DefaultNVGPUTarget()) { + } else if (target == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA cinn::runtime::cuda::cinn_call_uniform_random( v_args, num_args, min, max, seed, nullptr); @@ -276,7 +276,7 @@ TEST(CustomCallUniformRandom, test_target_nvgpu) { } TEST(CustomCallCholesky, test) { - Target target = common::DefaultTarget(); + Target target = cinn::common::DefaultTarget(); // Batch size int batch_size = 1; @@ -331,7 +331,7 @@ TEST(CustomCallCholesky, test) { cinn_pod_value_t v_args[2] = {cinn_pod_value_t(x.get()), cinn_pod_value_t(out.get())}; - if (target == common::DefaultHostTarget()) { + if (target == cinn::common::DefaultHostTarget()) { #ifdef CINN_WITH_MKL_CBLAS cinn_call_cholesky_host(v_args, num_args, batch_size, m, upper); for (int i = 0; i < batch_size * m * m; i++) { @@ -342,7 +342,7 @@ TEST(CustomCallCholesky, test) { LOG(INFO) << "Host Target only support on flag CINN_WITH_MKL_CBLAS ON! 
" "Please check."; #endif - } else if (target == common::DefaultNVGPUTarget()) { + } else if (target == cinn::common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA cinn::runtime::cuda::cinn_call_cholesky_nvgpu( v_args, num_args, batch_size, m, upper); @@ -364,7 +364,7 @@ TEST(CustomCallCholesky, test) { #ifdef CINN_WITH_CUDA TEST(CustomCallTriangularSolve, test) { - Target target = common::DefaultNVGPUTarget(); + Target target = cinn::common::DefaultNVGPUTarget(); int batch_size = 1; int m = 3; diff --git a/paddle/cinn/runtime/flags.cc b/paddle/cinn/runtime/flags.cc index eb93d3442684b4..ac41a22f445623 100644 --- a/paddle/cinn/runtime/flags.cc +++ b/paddle/cinn/runtime/flags.cc @@ -304,10 +304,11 @@ bool IsCompiledWithCUDNN() { #endif } -common::Target CurrentTarget::target_ = common::DefaultTarget(); +cinn::common::Target CurrentTarget::target_ = cinn::common::DefaultTarget(); -void CurrentTarget::SetCurrentTarget(const common::Target& target) { - if (!IsCompiledWithCUDA() && target.arch == common::Target::Arch::NVGPU) { +void CurrentTarget::SetCurrentTarget(const cinn::common::Target& target) { + if (!IsCompiledWithCUDA() && + target.arch == cinn::common::Target::Arch::NVGPU) { LOG(FATAL) << "Current CINN version does not support NVGPU, please try to " "recompile with -DWITH_CUDA."; } else { @@ -315,7 +316,7 @@ void CurrentTarget::SetCurrentTarget(const common::Target& target) { } } -common::Target& CurrentTarget::GetCurrentTarget() { return target_; } +cinn::common::Target& CurrentTarget::GetCurrentTarget() { return target_; } } // namespace runtime } // namespace cinn diff --git a/paddle/cinn/runtime/flags.h b/paddle/cinn/runtime/flags.h index e75af97a2a276d..ef7225c281c03d 100644 --- a/paddle/cinn/runtime/flags.h +++ b/paddle/cinn/runtime/flags.h @@ -47,15 +47,15 @@ bool IsCompiledWithCUDNN(); class CurrentTarget { public: - static common::Target &GetCurrentTarget(); - static void SetCurrentTarget(const common::Target &target); + static cinn::common::Target &GetCurrentTarget(); + static void SetCurrentTarget(const cinn::common::Target &target); private: CurrentTarget() = default; CurrentTarget(const CurrentTarget &) = delete; CurrentTarget &operator=(const CurrentTarget &) = delete; - static common::Target target_; + static cinn::common::Target target_; }; } // namespace runtime diff --git a/paddle/cinn/runtime/intrinsic.cc b/paddle/cinn/runtime/intrinsic.cc index 41e12331650b68..eb68cb5637cf3d 100644 --- a/paddle/cinn/runtime/intrinsic.cc +++ b/paddle/cinn/runtime/intrinsic.cc @@ -25,7 +25,7 @@ using cinn::common::float16; cinn_type_t ToRuntimeType(Type type) { #define SET_TYPE_CASE_ITEM(compiled_type, runtime_type) \ - if (type == common::compiled_type()) { \ + if (type == cinn::common::compiled_type()) { \ return runtime_type(); \ } diff --git a/paddle/cinn/utils/data_util.cc b/paddle/cinn/utils/data_util.cc index 5066395305f756..ddd8a451e8ffb5 100644 --- a/paddle/cinn/utils/data_util.cc +++ b/paddle/cinn/utils/data_util.cc @@ -19,7 +19,7 @@ namespace cinn { void SetRandInt(hlir::framework::Tensor tensor, - const common::Target& target, + const cinn::common::Target& target, int seed, int low, int high) { @@ -37,7 +37,7 @@ void SetRandInt(hlir::framework::Tensor tensor, auto* data = tensor->mutable_data(target); #ifdef CINN_WITH_CUDA - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { cudaMemcpy(data, random_data.data(), num_ele * sizeof(int), @@ -45,13 +45,13 @@ void SetRandInt(hlir::framework::Tensor tensor, return; } #endif - 
CHECK(target == common::DefaultHostTarget()); + CHECK(target == cinn::common::DefaultHostTarget()); std::copy(random_data.begin(), random_data.end(), data); } template <> void SetRandData(hlir::framework::Tensor tensor, - const common::Target& target, + const cinn::common::Target& target, int seed) { if (seed == -1) { std::random_device rd; @@ -67,7 +67,7 @@ void SetRandData(hlir::framework::Tensor tensor, auto* data = tensor->mutable_data(target); #ifdef CINN_WITH_CUDA - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { cudaMemcpy(data, random_data.data(), num_ele * sizeof(float), @@ -75,13 +75,13 @@ void SetRandData(hlir::framework::Tensor tensor, return; } #endif - CHECK(target == common::DefaultHostTarget()); + CHECK(target == cinn::common::DefaultHostTarget()); std::copy(random_data.begin(), random_data.end(), data); } template <> void SetRandData(hlir::framework::Tensor tensor, - const common::Target& target, + const cinn::common::Target& target, int seed) { if (seed == -1) { std::random_device rd; @@ -97,48 +97,48 @@ void SetRandData(hlir::framework::Tensor tensor, auto* data = tensor->mutable_data(target); #ifdef CINN_WITH_CUDA - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { cudaMemcpy(data, random_data.data(), num_ele * sizeof(float), cudaMemcpyHostToDevice); - } else if (target == common::DefaultHostTarget()) { + } else if (target == cinn::common::DefaultHostTarget()) { std::copy(random_data.begin(), random_data.end(), data); } else { CINN_NOT_IMPLEMENTED } #else - CHECK(target == common::DefaultHostTarget()); + CHECK(target == cinn::common::DefaultHostTarget()); std::copy(random_data.begin(), random_data.end(), data); #endif } template std::vector GetTensorData(const hlir::framework::Tensor& tensor, - const common::Target& target) { + const cinn::common::Target& target) { auto size = tensor->shape().numel(); std::vector data(size); #ifdef CINN_WITH_CUDA - if (target == common::DefaultNVGPUTarget()) { + if (target == cinn::common::DefaultNVGPUTarget()) { cudaMemcpy(data.data(), static_cast(tensor->data()), size * sizeof(T), cudaMemcpyDeviceToHost); - } else if (target == common::DefaultHostTarget()) { + } else if (target == cinn::common::DefaultHostTarget()) { std::copy(tensor->data(), tensor->data() + size, data.begin()); } else { CINN_NOT_IMPLEMENTED } #else - CHECK(target == common::DefaultHostTarget()); + CHECK(target == cinn::common::DefaultHostTarget()); std::copy(tensor->data(), tensor->data() + size, data.begin()); #endif return data; } template std::vector GetTensorData( - const hlir::framework::Tensor& tensor, const common::Target& target); + const hlir::framework::Tensor& tensor, const cinn::common::Target& target); template std::vector GetTensorData( - const hlir::framework::Tensor& tensor, const common::Target& target); + const hlir::framework::Tensor& tensor, const cinn::common::Target& target); } // namespace cinn diff --git a/paddle/cinn/utils/data_util.h b/paddle/cinn/utils/data_util.h index a55ad554579f19..b3fa5745b5ab45 100644 --- a/paddle/cinn/utils/data_util.h +++ b/paddle/cinn/utils/data_util.h @@ -38,18 +38,18 @@ namespace cinn { * [low, high). 
*/ void SetRandInt(hlir::framework::Tensor tensor, - const common::Target& target, + const cinn::common::Target& target, int seed = -1, int low = 0, int high = 11); template void SetRandData(hlir::framework::Tensor tensor, - const common::Target& target, + const cinn::common::Target& target, int seed = -1); template std::vector GetTensorData(const hlir::framework::Tensor& tensor, - const common::Target& target); + const cinn::common::Target& target); } // namespace cinn diff --git a/paddle/common/CMakeLists.txt b/paddle/common/CMakeLists.txt index 2ae07983c77858..dae3176aca6d19 100644 --- a/paddle/common/CMakeLists.txt +++ b/paddle/common/CMakeLists.txt @@ -26,3 +26,6 @@ cc_library(common ${COMMON_BUILD_TYPE} SRCS ${common_srcs}) if(WIN32) set_property(TARGET common PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() + +# only used for libpaddle_inference.a +cc_library(common_static STATIC SRCS ${common_srcs}) diff --git a/paddle/common/array.h b/paddle/common/array.h index db604605f4ce33..11457a1eaa756b 100644 --- a/paddle/common/array.h +++ b/paddle/common/array.h @@ -140,3 +140,8 @@ class Array { }; } // namespace common + +namespace phi { +template +using Array = common::Array; +} // namespace phi diff --git a/paddle/common/ddim.cc b/paddle/common/ddim.cc index 76069a1a037123..c4ae80c75db852 100644 --- a/paddle/common/ddim.cc +++ b/paddle/common/ddim.cc @@ -18,6 +18,57 @@ namespace common { +DDim::DDim() : rank_(-1) { dim_[0] = 0; } + +DDim::DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); } + +DDim::DDim(const int* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); +} + +DDim::DDim(const int64_t* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); +} + +DDim::DDim(std::initializer_list init_list) + : DDim(init_list.begin(), init_list.size()) {} + +int64_t& DDim::at(int idx) { + COMMON_ENFORCE_GE(idx, + 0, + common::errors::InvalidArgument( + "Invalid DDim index to be accessed. The valid index " + "is between 0 and %d, but received index is %d.", + rank_, + idx)); + COMMON_ENFORCE_LT(idx, + rank_, + common::errors::InvalidArgument( + "Invalid DDim index to be accessed. The valid index " + "is between 0 and %d, but received index is %d.", + rank_, + idx)); + return dim_[idx]; +} + +int64_t DDim::at(int idx) const { + COMMON_ENFORCE_GE(idx, + 0, + common::errors::InvalidArgument( + "Invalid DDim index to be accessed. The valid index " + "is between 0 and %d, but received index is %d.", + rank_, + idx)); + COMMON_ENFORCE_LT(idx, + rank_, + common::errors::InvalidArgument( + "Invalid DDim index to be accessed. 
The valid index " + "is between 0 and %d, but received index is %d.", + rank_, + idx)); + return dim_[idx]; +} + DDim make_ddim(std::initializer_list dims) { return DDim(dims.begin(), static_cast(dims.size())); } diff --git a/paddle/common/ddim.h b/paddle/common/ddim.h index cfed0d221221d9..4710708c70d4a2 100644 --- a/paddle/common/ddim.h +++ b/paddle/common/ddim.h @@ -19,6 +19,7 @@ #include #include "paddle/common/dim.h" +#include "paddle/common/enforce.h" #include "paddle/common/exception.h" #include "paddle/utils/test_macros.h" @@ -68,26 +69,21 @@ class TEST_API DDim { public: constexpr static int kMaxRank = 9; - DDim() : rank_(-1) { dim_[0] = 0; } + DDim(); - DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); } + DDim(const DDim& ddim); - DDim(const int* d, int n) : rank_(n) { - dynamic_dim_assign(d, dim_.GetMutable(), n); - } + DDim(const int* d, int n); - DDim(const int64_t* d, int n) : rank_(n) { - dynamic_dim_assign(d, dim_.GetMutable(), n); - } + DDim(const int64_t* d, int n); + + /*implicit*/ DDim(std::initializer_list init_list); template /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT UnsafeCast() = in; } - /*implicit*/ DDim(std::initializer_list init_list) - : DDim(init_list.begin(), init_list.size()) {} - inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); } template @@ -101,41 +97,9 @@ class TEST_API DDim { inline int64_t operator[](int idx) const { return dim_[idx]; } - int64_t& at(int idx) { - COMMON_ENFORCE_GE(idx, - 0, - common::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - COMMON_ENFORCE_LT(idx, - rank_, - common::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - return dim_[idx]; - } + int64_t& at(int idx); - int64_t at(int idx) const { - COMMON_ENFORCE_GE(idx, - 0, - common::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - COMMON_ENFORCE_LT(idx, - rank_, - common::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - return dim_[idx]; - } + int64_t at(int idx) const; template typename std::result_of&)>::type apply_visitor( @@ -188,8 +152,8 @@ class TEST_API DDim { PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast())); } - friend DDim stride(const DDim& ddim); - friend DDim stride_numel(const DDim& ddim); + friend TEST_API DDim stride(const DDim& ddim); + friend TEST_API DDim stride_numel(const DDim& ddim); private: Dim dim_; @@ -229,7 +193,7 @@ std::vector vectorize(const DDim& ddim) { TEST_API int64_t product(const DDim& ddim); -bool contain_unknown_dim(const DDim& ddim); +TEST_API bool contain_unknown_dim(const DDim& ddim); /** * \brief Slice a ddim @@ -238,7 +202,7 @@ bool contain_unknown_dim(const DDim& ddim); * e.g. DDim d = make_ddim({1,2,3,4,5}); * slice_ddim(d, 1, 3); ====> {2,3} */ -DDim slice_ddim(const DDim& dim, int begin, int end); +TEST_API DDim slice_ddim(const DDim& dim, int begin, int end); /** * \brief What is the length of this dimension? 
@@ -246,7 +210,7 @@ DDim slice_ddim(const DDim& dim, int begin, int end); * \param Dynamic dimension to inspect */ -int arity(const DDim& ddim); +TEST_API int arity(const DDim& ddim); TEST_API std::ostream& operator<<(std::ostream&, const DDim&); @@ -255,22 +219,49 @@ TEST_API std::ostream& operator<<(std::ostream&, const DDim&); * e.g., DDim d = make_ddim({1, 2, 3, 4, 5, 6}) * flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30} */ -DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims); +TEST_API DDim flatten_to_3d(const DDim& src, + int num_row_dims, + int num_col_dims); // Reshape a tensor to a matrix. The matrix's first dimension (column length) // will be the product of the tensor's first `num_col_dims` dimensions. -DDim flatten_to_2d(const DDim& src, int num_col_dims); +TEST_API DDim flatten_to_2d(const DDim& src, int num_col_dims); -DDim flatten_to_1d(const DDim& src); +TEST_API DDim flatten_to_1d(const DDim& src); -DDim stride(const DDim& ddim); +TEST_API DDim stride(const DDim& ddim); -DDim stride_numel(const DDim& ddim); +TEST_API DDim stride_numel(const DDim& ddim); } // namespace common +namespace paddle { +namespace framework { +using DDim = common::DDim; +} +} // namespace paddle + +namespace phi { +using DDim = common::DDim; +using common::arity; +using common::contain_unknown_dim; +using common::flatten_to_1d; +using common::flatten_to_2d; +using common::flatten_to_3d; +using common::make_ddim; +using common::product; +using common::slice_ddim; +using common::stride; +using common::stride_numel; +using common::vectorize; +} // namespace phi + +namespace pir { +using DDim = common::DDim; +} + namespace std { template <> -struct hash<common::DDim> { +struct TEST_API hash<common::DDim> { std::size_t operator()(common::DDim const& ddim) const; }; } // namespace std diff --git a/paddle/common/dim.h b/paddle/common/dim.h index 4423a452a370a5..68cd29872f8766 100644 --- a/paddle/common/dim.h +++ b/paddle/common/dim.h @@ -101,3 +101,16 @@ inline void static_dim_assign(const T1* in, T2* out) { } } // namespace common + +// Note: `namespace paddle::framework` will cause a CI error. +namespace paddle { +namespace framework { +template <int D> +using Dim = common::Dim<D>; +} +} // namespace paddle + +namespace phi { +template <int D> +using Dim = common::Dim<D>; +} diff --git a/paddle/common/enforce.h b/paddle/common/enforce.h index d09f8942e79a75..e2a33951da71d4 100644 --- a/paddle/common/enforce.h +++ b/paddle/common/enforce.h @@ -1,13 +1,16 @@ -/* Copyright (c) 2013 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once @@ -26,14 +29,15 @@ limitations under the License. */ #include <windows.h> // GetModuleFileName, Sleep #endif +#include "paddle/common/errors.h" #include "paddle/common/macros.h" + #if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) #include <execinfo.h> #endif -// #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with -// windows.h -#include "paddle/common/errors.h" +// msvc: glog logging conflicts with windows.h +#define GLOG_NO_ABBREVIATED_SEVERITIES #include "paddle/utils/string/printf.h" #include "paddle/utils/string/to_string.h" #include "paddle/utils/test_macros.h" @@ -50,14 +54,13 @@ class CommonNotMetException : public std::exception { std::string err_str_; }; } // namespace common + namespace common { namespace enforce { -#if !defined(_WIN32) -#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) -#else -// there is no equivalent intrinsics in msvc. -#define UNLIKELY(condition) (condition) +/** HELPER MACROS AND FUNCTIONS **/ +#ifndef PADDLE_MAY_THROW +#define PADDLE_MAY_THROW noexcept(false) #endif #if defined _WIN32 && defined PADDLE_ON_INFERENCE && defined PADDLE_NO_PYTHON @@ -160,3 +163,54 @@ using CommonType2 = typename std::add_lvalue_reference< } // namespace enforce } // namespace common + +// TODO(zhangbopd): This is a copy from pir, and should be removed after +// merging this into the common enforce namespace above. +template <typename T> +inline bool is_error(const T& stat) { + return !stat; +} + +namespace pir { +class IrNotMetException : public std::exception { + public: + explicit IrNotMetException(const std::string& str) : err_str_(str) {} + + const char* what() const noexcept override { return err_str_.c_str(); } + + private: + std::string err_str_; +}; + +#define IR_THROW(...) \ + do { \ + try { \ + throw pir::IrNotMetException( \ + paddle::string::Sprintf("Error occurred at: %s:%d :\n%s", \ + __FILE__, \ + __LINE__, \ + paddle::string::Sprintf(__VA_ARGS__))); \ + } catch (const std::exception& e) { \ + std::cout << e.what() << std::endl; \ + throw; \ + } \ + } while (0) + +#define IR_ENFORCE(COND, ...) \ + do { \ + bool __cond__(COND); \ + if (UNLIKELY(is_error(__cond__))) { \ + try { \ + throw pir::IrNotMetException( \ + paddle::string::Sprintf("Error occurred at: %s:%d :\n%s", \ + __FILE__, \ + __LINE__, \ + paddle::string::Sprintf(__VA_ARGS__))); \ + } catch (const std::exception& e) { \ + std::cout << e.what() << std::endl; \ + throw; \ + } \ + } \ + } while (0) + +} // namespace pir diff --git a/paddle/phi/core/errors.cc b/paddle/common/errors.cc similarity index 93% rename from paddle/phi/core/errors.cc rename to paddle/common/errors.cc index 0fcf8f292c1e17..b720132c505f56 100644 --- a/paddle/phi/core/errors.cc +++ b/paddle/common/errors.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include -namespace phi { +namespace common { std::string error_name(ErrorCode code) { switch (code) { case ErrorCode::LEGACY: @@ -70,4 +70,4 @@ std::string ErrorSummary::to_string() const { result += error_message(); return result; } -} // namespace phi +} // namespace common diff --git a/paddle/common/errors.h b/paddle/common/errors.h index 826a2350606587..e2ebf971f36cac 100644 --- a/paddle/common/errors.h +++ b/paddle/common/errors.h @@ -145,3 +145,9 @@ REGISTER_ERROR(External, ErrorCode::EXTERNAL) } // namespace errors } // namespace common + +namespace phi { +namespace errors = ::common::errors; +using ErrorCode = ::common::ErrorCode; +using ErrorSummary = ::common::ErrorSummary; +} // namespace phi diff --git a/paddle/phi/common/layout.h b/paddle/common/layout.h similarity index 93% rename from paddle/phi/common/layout.h rename to paddle/common/layout.h index 622962d787a9e7..408dd51c1be9db 100644 --- a/paddle/phi/common/layout.h +++ b/paddle/common/layout.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,9 +15,9 @@ limitations under the License. */ #pragma once #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" -namespace phi { +namespace common { // Note: The original design of paddle DataLayout is confusing. // It contains two levels of "layout", one is the data layout @@ -132,10 +132,18 @@ inline std::ostream& operator<<(std::ostream& os, DataLayout layout) { return os; } -} // namespace phi +} // namespace common + +namespace pir { +using DataLayout = common::DataLayout; +} + +namespace phi { +using DataLayout = common::DataLayout; +} namespace paddle { // In order to be compatible with the original custom operator Tensor interface -using DataLayout = phi::DataLayout; +using DataLayout = common::DataLayout; } // namespace paddle diff --git a/paddle/common/macros.h b/paddle/common/macros.h index 13ce7e6d82d9c0..8506aa92486640 100644 --- a/paddle/common/macros.h +++ b/paddle/common/macros.h @@ -17,13 +17,14 @@ limitations under the License. */ namespace common { // Disable the copy and assignment operator for a class. - +#ifndef DISABLE_COPY_AND_ASSIGN #define DISABLE_COPY_AND_ASSIGN(classname) \ private: \ classname(const classname&) = delete; \ classname(classname&&) = delete; \ classname& operator=(const classname&) = delete; \ classname& operator=(classname&&) = delete +#endif #define PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ _PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) @@ -47,6 +48,25 @@ namespace common { #define UNUSED __attribute__((unused)) #endif +// Because most enforce conditions would evaluate to true, we can use +// __builtin_expect to instruct the C++ compiler to generate code that +// always forces branch prediction of true. +// This generates faster binary code. __builtin_expect has been available +// since C++11. +// For more details, please check https://stackoverflow.com/a/43870188/724872. +#if !defined(_WIN32) +#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) +#else +// there are no equivalent intrinsics in msvc.
+#define UNLIKELY(condition) (condition) +#endif + +#if !defined(_WIN32) +#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1) +#else +// there are no equivalent intrinsics in msvc. +#define LIKELY(condition) (condition) +#endif + #define PD_CONCATENATE(arg1, arg2) PD_CONCATENATE1(arg1, arg2) #define PD_CONCATENATE1(arg1, arg2) PD_CONCATENATE2(arg1, arg2) #define PD_CONCATENATE2(arg1, arg2) arg1##arg2 diff --git a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt index 14c4b693885927..d1eae7f5995490 100644 --- a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt +++ b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt @@ -3,6 +3,6 @@ add_subdirectory(spmd_rules) cc_library( op_dist_attr SRCS dist_attr.cc - DEPS phi auto_parallel_proto proto_desc) + DEPS phi common auto_parallel_proto proto_desc) cc_library(auto_parallel DEPS op_dist_attr spmd_rules) diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/spmd_rules/CMakeLists.txt index 42fde81693429c..f16c1558905791 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/CMakeLists.txt +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/CMakeLists.txt @@ -3,4 +3,4 @@ file(GLOB spmd_srcs *.cc) cc_library( spmd_rules SRCS ${spmd_srcs} - DEPS phi) + DEPS phi common) diff --git a/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt index 954af0cc852a03..55948f83c635eb 100644 --- a/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt +++ b/paddle/fluid/distributed/auto_parallel/test/CMakeLists.txt @@ -1,22 +1,22 @@ cc_test( device_mesh_test SRCS device_mesh_test.cc - DEPS phi) + DEPS phi common) cc_test( process_mesh_test SRCS process_mesh_test.cc - DEPS phi) + DEPS phi common) cc_test( dist_attr_test SRCS dist_attr_test.cc - DEPS phi proto_desc) + DEPS phi common proto_desc) cc_test( dist_mapper_test SRCS dist_mapper_test.cc - DEPS phi) + DEPS phi common) cc_test( spmd_rule_test diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index a2267e1f6cebdd..d42b810972dc85 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,18 +1,18 @@ cc_library( process_group SRCS process_group.cc - DEPS phi xxhash) + DEPS phi common xxhash) cc_library( eager_reducer SRCS reducer.cc - DEPS eager_api process_group phi string_helper) + DEPS eager_api process_group phi common string_helper) if(WITH_DISTRIBUTE) cc_library( process_group_gloo SRCS process_group_gloo.cc gloo_send_recv.cc - DEPS phi eager_api gloo_wrapper) + DEPS phi common eager_api gloo_wrapper) endif() if(WITH_NCCL OR WITH_RCCL) @@ -21,6 +21,7 @@ if(WITH_NCCL OR WITH_RCCL) SRCS process_group_nccl.cc common.cc DEPS process_group phi + common place enforce collective_helper @@ -32,7 +33,13 @@ if(WITH_XPU_BKCL) cc_library( process_group_bkcl SRCS process_group_bkcl.cc bkcl_tools.cc common.cc - DEPS process_group phi place enforce collective_helper device_context) + DEPS process_group + phi + common + place + enforce + collective_helper + device_context) endif() if(WITH_MPI) @@ -46,7 +53,13 @@ if(WITH_CUSTOM_DEVICE) cc_library( process_group_custom SRCS process_group_custom.cc custom_ccl_tools.cc common.cc - DEPS process_group phi place enforce collective_helper device_context) + DEPS process_group + phi + common + place + enforce +
collective_helper + device_context) endif() set(COMM_UTILS_DEPS process_group) diff --git a/paddle/fluid/distributed/collective/process_group.h b/paddle/fluid/distributed/collective/process_group.h index e2b31950bd51bc..ae1c7fe7d9d97c 100644 --- a/paddle/fluid/distributed/collective/process_group.h +++ b/paddle/fluid/distributed/collective/process_group.h @@ -20,12 +20,12 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" #include "paddle/phi/core/distributed/types.h" #include "paddle/phi/core/distributed/utils.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" constexpr auto kWaitTimeout = std::chrono::milliseconds(0); diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.cc b/paddle/fluid/distributed/collective/process_group_bkcl.cc index 7a295b3360602a..8b306e29f52b32 100644 --- a/paddle/fluid/distributed/collective/process_group_bkcl.cc +++ b/paddle/fluid/distributed/collective/process_group_bkcl.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/collective/process_group_bkcl.h" +#include "paddle/common/errors.h" #include "paddle/fluid/distributed/collective/bkcl_tools.h" #include "paddle/fluid/distributed/collective/common.h" #include "paddle/fluid/framework/convert_utils.h" @@ -25,7 +26,6 @@ #include "paddle/phi/core/distributed/check/static_check.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/collective/process_group_with_stream.h b/paddle/fluid/distributed/collective/process_group_with_stream.h index 0cea9bb3ed87e6..58d1a042fec3c8 100644 --- a/paddle/fluid/distributed/collective/process_group_with_stream.h +++ b/paddle/fluid/distributed/collective/process_group_with_stream.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/errors.h" #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/collective/process_group_without_stream.h b/paddle/fluid/distributed/collective/process_group_without_stream.h index dd22c0f1e4cbdb..a3c103574cbc5a 100644 --- a/paddle/fluid/distributed/collective/process_group_without_stream.h +++ b/paddle/fluid/distributed/collective/process_group_without_stream.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/errors.h" #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/common/afs_warpper.h b/paddle/fluid/distributed/common/afs_warpper.h index 516b35448fe516..30f4f164ba5a1d 100644 --- a/paddle/fluid/distributed/common/afs_warpper.h +++ b/paddle/fluid/distributed/common/afs_warpper.h @@ -20,9 +20,9 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/string/string_helper.h" -#include "paddle/phi/core/macros.h" namespace paddle { namespace distributed { struct FsDataConverter { diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 9c28205520129c..84b58422ab2e71 100755 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ 
b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -5,9 +5,9 @@ endif() proto_library(interceptor_message_proto SRCS interceptor_message.proto) if(WITH_ARM_BRPC) - set(BRPC_DEPS arm_brpc snappy phi glog) + set(BRPC_DEPS arm_brpc snappy phi common glog) elseif(WITH_DISTRIBUTE AND NOT WITH_PSLIB) - set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS} zlib phi) + set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS} zlib phi common) else() set(BRPC_DEPS "") endif() @@ -15,7 +15,7 @@ endif() cc_library( task_loop_thread_pool SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc - DEPS enforce glog) + DEPS enforce glog common) cc_library( fleet_executor SRCS fleet_executor.cc @@ -43,6 +43,7 @@ cc_library( executor_gc_helper op_registry phi + common glog ${BRPC_DEPS}) if(WITH_DISTRIBUTE) diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 7817b9bc0e9dfe..4190019e0d1738 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -14,12 +14,12 @@ #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" +#include "paddle/common/errors.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/jit/serializer.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc index 2e3389af5feb59..704dd16400065c 100644 --- a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc @@ -14,13 +14,13 @@ #include "paddle/fluid/distributed/fleet_executor/cond_interceptor.h" #include +#include "paddle/common/errors.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 1ec8c11fdf610d..a1fd38295319ed 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -46,7 +46,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, phi::DenseTensor *input_tensor, const platform::Place &place) { VLOG(3) << "Loading data from DistModelTensor for " << input_data.name; - framework::DDim dims = phi::make_ddim(input_data.shape); + framework::DDim dims = common::make_ddim(input_data.shape); void *input_tensor_ptr = nullptr; if (input_data.dtype == DistModelDataType::INT64) { input_tensor_ptr = input_tensor->mutable_data(dims, place); @@ -645,7 +645,7 @@ bool DistModel::FetchResults(std::vector *output_data, template bool DistModel::FetchResult(const phi::DenseTensor &fetch, DistModelTensor *output_data) { - auto shape = phi::vectorize(fetch.dims()); + auto shape = common::vectorize(fetch.dims()); output_data->shape.assign(shape.begin(), shape.end()); const T *data = fetch.data(); int64_t num_elems = fetch.numel(); diff 
--git a/paddle/fluid/distributed/fleet_executor/start_interceptor.cc b/paddle/fluid/distributed/fleet_executor/start_interceptor.cc index 830f619ed3c00c..1fe4aaea15fc4d 100644 --- a/paddle/fluid/distributed/fleet_executor/start_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/start_interceptor.cc @@ -14,9 +14,9 @@ #include "paddle/fluid/distributed/fleet_executor/start_interceptor.h" +#include "paddle/common/errors.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 63d4fa1bf97049..71474ec6be6fb4 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -37,7 +37,7 @@ namespace distributed { std::vector GetOps() { framework::AttributeMap attrs; attrs["dtype"] = framework::proto::VarType::FP32; - attrs["shape"] = phi::vectorize({2, 3}); + attrs["shape"] = common::vectorize({2, 3}); attrs["value"] = 1.0f; auto zero_op = framework::OpRegistry::CreateOp( diff --git a/paddle/fluid/distributed/ps/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt index c23f26c6352180..eac2585416d8b8 100755 --- a/paddle/fluid/distributed/ps/service/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt @@ -3,11 +3,11 @@ set_source_files_properties(${BRPC_SRCS}) if(WITH_HETERPS) - set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS} phi zlib device_context rocksdb) + set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS} phi common zlib device_context rocksdb) else() - set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS} phi zlib device_context) + set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS} phi common zlib device_context) endif() @@ -99,6 +99,7 @@ cc_library( simple_rpc scope phi + common ps_gpu_wrapper fleet ${RPC_DEPS}) @@ -126,7 +127,7 @@ cc_library( #cc_library( # communicator # SRCS communicator/communicator.cc -# DEPS scope client table phi ${RPC_DEPS}) +# DEPS scope client table phi common ${RPC_DEPS}) #cc_library( # ps_service # SRCS ps_service/service.cc diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index 9ad8768e0927d5..c0e09af4c550b6 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -1505,7 +1505,7 @@ int32_t BrpcPsClient::RecvAndSaveTable(const uint64_t table_id, phi::DenseTensor *var_tensor = var->GetMutable(); std::vector vec_dim = {var_num, var_shape}; - var_tensor->Resize(phi::make_ddim(vec_dim)); + var_tensor->Resize(common::make_ddim(vec_dim)); // copy and save float *tensor_data = var_tensor->mutable_data(place); diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.h b/paddle/fluid/distributed/ps/service/brpc_ps_client.h old mode 100755 new mode 100644 index d902824bfd60c9..73f730bf6d8fde --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.h @@ -23,6 +23,7 @@ #include "brpc/channel.h" #include "brpc/controller.h" #include "brpc/server.h" +#include "paddle/common/macros.h" #include "paddle/fluid/distributed/ps/service/brpc_utils.h" #include "paddle/fluid/distributed/ps/service/ps_client.h" #include 
"paddle/fluid/distributed/ps/service/sendrecv.pb.h" @@ -30,7 +31,6 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/phi/core/macros.h" namespace brpc { class Channel; class Controller; diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc index 715d1bbf954f07..8006e6d943579b 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -106,7 +106,7 @@ void SerializeLodTensor(framework::Variable* var, } var_msg->set_data_type(static_cast( framework::TransToProtoVarType(tensor->dtype()))); - for (auto& dim : phi::vectorize(tensor->dims())) { + for (auto& dim : common::vectorize(tensor->dims())) { var_msg->add_dims(dim); } // IO Buffer @@ -153,7 +153,7 @@ void SerializeSelectedRows(framework::Variable* var, memcpy(data_ptr, &((*rows)[0]), rows->size() * sizeof(int64_t)); var_msg->set_data_type(static_cast( framework::TransToProtoVarType(tensor->dtype()))); - for (auto& dim : phi::vectorize(tensor->dims())) { + for (auto& dim : common::vectorize(tensor->dims())) { var_msg->add_dims(dim); } // IO Buffer @@ -232,7 +232,7 @@ void DeserializeLodTensor(framework::Variable* var, for (auto& x : msg.dims()) { vec_dim.push_back(x); } - tensor->Resize(phi::make_ddim(vec_dim)); + tensor->Resize(common::make_ddim(vec_dim)); framework::LoD lod; for (int i = 0; i < msg.lod_level(); ++i) { @@ -288,7 +288,7 @@ void DeserializeSelectedRows( for (auto& x : msg.dims()) { vec_dim.push_back(x); } - tensor->Resize(phi::make_ddim(vec_dim)); + tensor->Resize(common::make_ddim(vec_dim)); void* tensor_data = tensor->mutable_data( place, framework::TransToPhiDataType(VarMessageToVarType(msg.data_type()))); diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc index 9932343fa779bd..f7a8410919f4cc 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -872,7 +872,7 @@ bool AsyncCommunicator::Check(const std::vector &var_tables) { VLOG(3) << "send step_counter into queue"; auto tmp_var = std::make_shared(); auto *tensor = tmp_var->GetMutable(); - tensor->Resize(phi::make_ddim({1})); + tensor->Resize(common::make_ddim({1})); auto *out_d = tensor->mutable_data(platform::CPUPlace()); out_d[0] = 1; send_varname_to_queue_[table_name]->Push(tmp_var); diff --git a/paddle/fluid/distributed/ps/service/env.h b/paddle/fluid/distributed/ps/service/env.h index d6b403523496c5..0552c54282d35c 100644 --- a/paddle/fluid/distributed/ps/service/env.h +++ b/paddle/fluid/distributed/ps/service/env.h @@ -25,7 +25,7 @@ #include #include -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" #include "paddle/utils/flags.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index fc1d4a2bd343ba..5a0764b11e8a1f 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -23,6 +23,7 @@ #include "butil/endpoint.h" #include "google/protobuf/service.h" +#include "paddle/common/macros.h" #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" @@ -31,7 +32,6 @@ #include 
"paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/macros.h" namespace google { namespace protobuf { diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index bedb28ec231f7b..d4e5a81f718b4a 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -18,7 +18,7 @@ set_source_files_properties( cc_library( graph_node SRCS ${graphDir}/graph_node.cc - DEPS WeightedSampler enforce) + DEPS WeightedSampler enforce common) set_source_files_properties( memory_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties( @@ -46,7 +46,8 @@ cc_library( string_helper simple_threadpool xxhash - phi) + phi + common) set_source_files_properties( tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -90,6 +91,7 @@ cc_library( string_helper device_context phi + common glog framework_io afs_wrapper diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h index 8a5c7c1ce10dbe..d8e38491aa2a39 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h @@ -16,7 +16,7 @@ #include #include #include -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index dc44831e891ca1..b39a12da02a4fc 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -22,6 +22,7 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/distributed/common/afs_warpper.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" @@ -32,7 +33,6 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/string_helper.h" -#include "paddle/phi/core/macros.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/rpc/CMakeLists.txt b/paddle/fluid/distributed/rpc/CMakeLists.txt index 4042a6fe3ccfeb..c7c31fd55be81b 100644 --- a/paddle/fluid/distributed/rpc/CMakeLists.txt +++ b/paddle/fluid/distributed/rpc/CMakeLists.txt @@ -12,7 +12,7 @@ set_source_files_properties( set_source_files_properties(rpc_agent.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set(PADDLE_RPC_DEPS ${EXTERNAL_BRPC_DEPS} zlib phi pybind) +set(PADDLE_RPC_DEPS ${EXTERNAL_BRPC_DEPS} zlib phi common pybind) proto_library(paddle_rpc_proto SRCS rpc.proto) cc_library( paddle_rpc diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index aaae9761330254..ba08768ab4a104 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -43,6 +43,7 @@ cc_test( DEPS brpc_utils scope phi + common sendrecv_rpc ps_service ${COMMON_DEPS} diff --git a/paddle/fluid/distributed/test/brpc_utils_test.cc b/paddle/fluid/distributed/test/brpc_utils_test.cc index 4ff9f2709b81c0..98dc18c3d4cbe9 100644 --- a/paddle/fluid/distributed/test/brpc_utils_test.cc +++ b/paddle/fluid/distributed/test/brpc_utils_test.cc @@ -37,7 +37,7 @@ void CreateVarsOnScope(framework::Scope* scope, // var 1 framework::Variable* var1 = 
scope->Var("x1"); auto* tensor1 = var1->GetMutable(); - tensor1->Resize(phi::make_ddim({512, 8, 4, 2})); + tensor1->Resize(common::make_ddim({512, 8, 4, 2})); framework::LoD lod1; lod1.push_back(phi::Vector({1, 3, 8})); tensor1->set_lod(lod1); @@ -47,7 +47,7 @@ void CreateVarsOnScope(framework::Scope* scope, // var 2 framework::Variable* var2 = scope->Var("x2"); auto* tensor2 = var2->GetMutable(); - tensor2->Resize(phi::make_ddim({1000, 64})); + tensor2->Resize(common::make_ddim({1000, 64})); framework::LoD lod2; lod2.push_back(phi::Vector({1, 1})); tensor2->set_lod(lod2); @@ -60,7 +60,7 @@ void CreateVarsOnScope(framework::Scope* scope, slr->set_height(564); auto* tensor3 = slr->mutable_value(); auto* rows = slr->mutable_rows(); - tensor3->Resize(phi::make_ddim({564, 128})); + tensor3->Resize(common::make_ddim({564, 128})); tensor3->mutable_data(*place); phi::funcs::set_constant(ctx, tensor3, 32.7); for (int i = 0; i < 564; ++i) rows->push_back(i); @@ -97,7 +97,7 @@ void RunMultiVarMsg(platform::Place place) { // check var1 framework::Variable* var1 = scope_recv.FindVar("x1"); auto* tensor1 = var1->GetMutable(); - EXPECT_EQ(tensor1->dims(), phi::make_ddim({512, 8, 4, 2})); + EXPECT_EQ(tensor1->dims(), common::make_ddim({512, 8, 4, 2})); // EXPECT_EQ(tensor1->lod(), phi::Vector({1, 3, 8})); auto* tensor_data1 = const_cast(tensor1->data()); int tensor_numel1 = 512 * 8 * 4 * 2; @@ -107,7 +107,7 @@ void RunMultiVarMsg(platform::Place place) { // check var2 framework::Variable* var2 = scope_recv.FindVar("x2"); auto* tensor2 = var2->GetMutable(); - EXPECT_EQ(tensor2->dims(), phi::make_ddim({1000, 64})); + EXPECT_EQ(tensor2->dims(), common::make_ddim({1000, 64})); // EXPECT_EQ(tensor2->lod(), phi::Vector({1, 1})); auto* tensor_data2 = const_cast(tensor2->data()); int tensor_numel2 = 1000 * 64; @@ -122,7 +122,7 @@ void RunMultiVarMsg(platform::Place place) { } auto* tensor3 = slr->mutable_value(); - EXPECT_EQ(tensor3->dims(), phi::make_ddim({564, 128})); + EXPECT_EQ(tensor3->dims(), common::make_ddim({564, 128})); auto* tensor_data3 = const_cast(tensor3->data()); int tensor_numel3 = 564 * 128; for (int i = 0; i < tensor_numel3; ++i) diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index f948e050387bca..a7ea765aadc3c8 100755 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,5 +1,6 @@ set(eager_deps phi + common hook_utils utils global_utils @@ -45,26 +46,27 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_library( backward SRCS backward.cc - DEPS grad_tensor_holder utils autograd_meta grad_node_info phi) + DEPS grad_tensor_holder utils autograd_meta grad_node_info phi common) endif() cc_library( eager_nan_inf_utils SRCS nan_inf_utils.cc - DEPS phi nan_inf_utils enforce) + DEPS phi common nan_inf_utils enforce) cc_library( grad_node_info SRCS grad_node_info.cc - DEPS phi) + DEPS phi common) cc_library( autograd_meta SRCS autograd_meta.cc - DEPS phi) + DEPS phi common) cc_library( utils SRCS utils.cc DEPS phi + common global_utils layer proto_desc diff --git a/paddle/fluid/eager/accumulation/CMakeLists.txt b/paddle/fluid/eager/accumulation/CMakeLists.txt index 574123661847b6..129baebf4ca876 100755 --- a/paddle/fluid/eager/accumulation/CMakeLists.txt +++ b/paddle/fluid/eager/accumulation/CMakeLists.txt @@ -2,5 +2,5 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( accumulation_node SRCS accumulation_node.cc - DEPS gradient_accumulator phi grad_node_info final_dygraph_function) + DEPS gradient_accumulator phi common grad_node_info 
final_dygraph_function) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 7567236c4ff68e..be15752419771c 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -107,7 +107,7 @@ void ScaleAPI(const paddle::Tensor& x, dense_tensor->dtype(), dense_tensor->dims(), dense_tensor->layout()); auto place = dense_tensor->place(); size_t bytes_size = - phi::product(dense_tensor->dims()) * SizeOf(dense_tensor->dtype()); + common::product(dense_tensor->dims()) * SizeOf(dense_tensor->dtype()); auto dense_out = std::make_shared( paddle::memory::Alloc(place, bytes_size), std::move(tensor_meta)); // Handle Device Context diff --git a/paddle/fluid/eager/api/utils/CMakeLists.txt b/paddle/fluid/eager/api/utils/CMakeLists.txt index c6c5f4e9302a94..3de1959416b306 100755 --- a/paddle/fluid/eager/api/utils/CMakeLists.txt +++ b/paddle/fluid/eager/api/utils/CMakeLists.txt @@ -7,10 +7,10 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( hook_utils SRCS hook_utils.cc - DEPS phi autograd_meta grad_node_info utils accumulation_node) + DEPS phi common autograd_meta grad_node_info utils accumulation_node) else() cc_library( hook_utils SRCS hook_utils.cc - DEPS phi autograd_meta grad_node_info utils) + DEPS phi common autograd_meta grad_node_info utils) endif() diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index 25d2f4dacfd168..a6bb716e6b7ade 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -61,6 +61,12 @@ if(WIN32) list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/phi.dll) endif() + add_custom_command( + OUTPUT ${eager_generator_path}/common.dll + COMMAND ${CMAKE_COMMAND} -E copy ${COMMON_LIB} ${eager_generator_path} + DEPENDS common) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/common.dll) + if(WITH_SHARED_IR) add_custom_command( OUTPUT ${eager_generator_path}/ir.dll diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt index a74ba2dc8c6287..189fb23c80aa8d 100644 --- a/paddle/fluid/eager/custom_operator/CMakeLists.txt +++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt @@ -1,9 +1,9 @@ cc_library( custom_operator_node SRCS custom_operator_node.cc - DEPS phi grad_node_info custom_operator utils custom_operator_utils) + DEPS phi common grad_node_info custom_operator utils custom_operator_utils) cc_library( custom_operator_utils SRCS custom_operator_utils.cc - DEPS phi grad_node_info custom_operator utils) + DEPS phi common grad_node_info custom_operator utils) diff --git a/paddle/fluid/eager/custom_operator/custom_operator_utils.cc b/paddle/fluid/eager/custom_operator/custom_operator_utils.cc index 795abd5e72f4e7..8894a06267b514 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_utils.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_utils.cc @@ -227,7 +227,8 @@ static std::vector> RunInferShapeFunc( auto duplicable_input_pair = ctx.InputRangeAt(inplace_reverse_map[i]); result.push_back({ctx.InputAt(duplicable_input_pair.first).dims()}); } else { - result.push_back({phi::make_ddim(output_shapes[output_shape_idx++])}); + result.push_back( + {common::make_ddim(output_shapes[output_shape_idx++])}); } } } @@ -436,7 +437,7 @@ 
paddle::Tensor BuildEmptyDistPaddleTensor( meta.dims = dims; meta.dtype = dtype; - auto dist_attr = phi::distributed::TensorDistAttr(phi::vectorize(dims)); + auto dist_attr = phi::distributed::TensorDistAttr(common::vectorize(dims)); dist_attr.set_process_mesh(process_mesh); auto dist_t = std::make_shared( @@ -604,7 +605,7 @@ void TransCtxTensorsToDistTensors( for (size_t i = 0; i < output_all->size(); ++i) { auto& tensor = output_all->at(i); phi::distributed::TensorDistAttr dist_attr = - phi::distributed::TensorDistAttr(phi::vectorize(tensor.dims())); + phi::distributed::TensorDistAttr(common::vectorize(tensor.dims())); dist_attr.set_process_mesh(current_process_mesh); auto dist_t = std::make_shared( std::dynamic_pointer_cast(tensor.impl()), @@ -615,7 +616,7 @@ void TransCtxTensorsToDistTensors( for (size_t i = 0; i < input_all->size(); ++i) { auto& tensor = input_all->at(i); phi::distributed::TensorDistAttr dist_attr = - phi::distributed::TensorDistAttr(phi::vectorize(tensor.dims())); + phi::distributed::TensorDistAttr(common::vectorize(tensor.dims())); dist_attr.set_process_mesh(current_process_mesh); auto dist_t = std::make_shared( std::dynamic_pointer_cast(tensor.impl()), diff --git a/paddle/fluid/eager/eager_layout_transformer.h b/paddle/fluid/eager/eager_layout_transformer.h index d707b14be416e0..81ad21302f2868 100644 --- a/paddle/fluid/eager/eager_layout_transformer.h +++ b/paddle/fluid/eager/eager_layout_transformer.h @@ -79,7 +79,7 @@ inline void DealWithShapeOp(paddle::Tensor* out_tensor, for (int i = 0; i < dim_size; i++) { dims[i] = value[i]; } - auto des_str = phi::DataLayoutToString(des_layout); + auto des_str = common::DataLayoutToString(des_layout); if (change_dim && des_str == "NCHW") { // NCHW -> NHWC VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0] << " " @@ -200,7 +200,7 @@ class EagerHeavilyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { std::string* layout) : op_name_(op_name), desired_layout_(DesiredLayout()) { VLOG(4) << "Heavily op: " << op_name << " layout " << *layout; - *layout = phi::DataLayoutToString(DesiredLayout()); + *layout = common::DataLayoutToString(DesiredLayout()); } paddle::Tensor TransInTensor(const std::string& in_name, @@ -247,13 +247,13 @@ class EagerLightlyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { const std::string& op_name) { VLOG(4) << "Lightly op : " << op_name; auto desired_layout = DesiredLayout(); - final_layout_ = phi::DataLayoutToString(desired_layout); + final_layout_ = common::DataLayoutToString(desired_layout); } // transpose from desired to default paddle::Tensor TransInTensor(const std::string& in_name UNUSED, const paddle::Tensor& in) { - std::string input_layout = phi::DataLayoutToString(in.layout()); + std::string input_layout = common::DataLayoutToString(in.layout()); auto default_layout = DefaultLayout(); if (final_layout_ == input_layout && in.shape().size() == 4) { auto out_tensor = EagerTraceTransposeOp(phi::DataLayout::UNDEFINED, in); diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index e8e74232888f46..f90cdba5b54b30 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -18,9 +18,9 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" // Phi deps +#include "paddle/common/macros.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/macros.h" namespace egr { diff --git 
a/paddle/fluid/eager/pylayer/CMakeLists.txt b/paddle/fluid/eager/pylayer/CMakeLists.txt index fe7a57fe795942..609f43b3e00942 100644 --- a/paddle/fluid/eager/pylayer/CMakeLists.txt +++ b/paddle/fluid/eager/pylayer/CMakeLists.txt @@ -1,4 +1,4 @@ cc_library( py_layer_node SRCS py_layer_node.cc - DEPS pybind phi grad_node_info) + DEPS pybind phi common grad_node_info) diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 542c6429c43c9e..1bc28549cb0c44 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -19,8 +19,8 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/common/layout.h" #include "paddle/phi/api/all.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_meta.h" @@ -538,7 +538,7 @@ void EagerUtils::FillZeroForEmptyOptionalGradInput( grad_in_metas[i].DistAttr())); if (grad_in_metas[i].GetTensorMeta().dims.size() != -1) { auto tensor_with_zero = paddle::experimental::full( - phi::vectorize(grad_in_metas[i].GetTensorMeta().dims), + common::vectorize(grad_in_metas[i].GetTensorMeta().dims), 0.0, grad_in_metas[i].GetTensorMeta().dtype, grad_in_metas[i].GetPlace()); @@ -548,7 +548,7 @@ void EagerUtils::FillZeroForEmptyOptionalGradInput( } } else { auto tensor_with_zero = paddle::experimental::full( - phi::vectorize(grad_in_metas[i].GetTensorMeta().dims), + common::vectorize(grad_in_metas[i].GetTensorMeta().dims), 0.0, grad_in_metas[i].GetTensorMeta().dtype, grad_in_metas[i].GetPlace()); @@ -576,7 +576,7 @@ void EagerUtils::FillZeroForEmptyOptionalGradOutput( grad_output_metas[i].DistAttr())); if (grad_output_metas[i].GetTensorMeta().dims.size() != -1) { auto tensor_with_zero = paddle::experimental::full( - phi::vectorize(grad_output_metas[i].GetTensorMeta().dims), + common::vectorize(grad_output_metas[i].GetTensorMeta().dims), 0.0, grad_output_metas[i].GetTensorMeta().dtype, grad_output_metas[i].GetPlace()); @@ -587,7 +587,7 @@ void EagerUtils::FillZeroForEmptyOptionalGradOutput( } else { auto tensor_with_zero = paddle::experimental::full( // only create dense tensor. 
- phi::vectorize(grad_output_metas[i].GetTensorMeta().dims), + common::vectorize(grad_output_metas[i].GetTensorMeta().dims), 0.0, grad_output_metas[i].GetTensorMeta().dtype, grad_output_metas[i].GetPlace()); @@ -610,7 +610,7 @@ void EagerUtils::FillZeroForEmptyGradInput(paddle::Tensor* in_grad, grad_in_meta.DistTensorGlobalDims(), grad_in_meta.DistAttr())); if (tensor_meta.dims.size() != -1) { auto tensor_with_zero = - paddle::experimental::full(phi::vectorize(tensor_meta.dims), + paddle::experimental::full(common::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, grad_in_meta.GetPlace()); @@ -620,7 +620,7 @@ void EagerUtils::FillZeroForEmptyGradInput(paddle::Tensor* in_grad, } } else { auto tensor_with_zero = - paddle::experimental::full(phi::vectorize(tensor_meta.dims), + paddle::experimental::full(common::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, grad_in_meta.GetPlace()); @@ -638,7 +638,7 @@ void EagerUtils::FillZeroForEmptyOptionalGradInput( grad_in_meta.DistTensorGlobalDims(), grad_in_meta.DistAttr())); if (tensor_meta.dims.size() != -1) { auto tensor_with_zero = - paddle::experimental::full(phi::vectorize(tensor_meta.dims), + paddle::experimental::full(common::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, grad_in_meta.GetPlace()); @@ -648,7 +648,7 @@ void EagerUtils::FillZeroForEmptyOptionalGradInput( } } else { auto tensor_with_zero = - paddle::experimental::full(phi::vectorize(tensor_meta.dims), + paddle::experimental::full(common::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, grad_in_meta.GetPlace()); diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 83b156d10e5cb5..e1395488160191 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -115,7 +115,7 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto cc_library( string_array SRCS string_array.cc - DEPS utf8proc phi) + DEPS utf8proc phi common) cc_library( data_type @@ -125,27 +125,27 @@ cc_library( cc_library( tensor SRCS tensor_util.cc - DEPS place memory data_type device_context phi) + DEPS place memory data_type device_context phi common) cc_library( lod_tensor SRCS lod_tensor.cc - DEPS phi place tensor framework_proto version) + DEPS phi common place tensor framework_proto version) cc_library( garbage_collector SRCS garbage_collector.cc - DEPS device_context memory phi glog) + DEPS device_context memory phi common glog) cc_library( reader SRCS reader.cc - DEPS lod_tensor phi) + DEPS lod_tensor phi common) cc_library( var_type_traits SRCS var_type_traits.cc - DEPS framework_proto scope phi) + DEPS framework_proto scope phi common) if(WITH_GPU) target_link_libraries(var_type_traits dynload_cuda) endif() @@ -177,7 +177,7 @@ endif() cc_library( scope SRCS scope.cc - DEPS glog phi xxhash var_type_traits) + DEPS glog phi common xxhash var_type_traits) cc_library( device_worker SRCS device_worker.cc @@ -228,24 +228,24 @@ endif() cc_library( data_layout_transform SRCS data_layout_transform.cc - DEPS tensor phi) + DEPS tensor phi common) cc_library( data_transform SRCS data_transform.cc - DEPS phi - tensor + DEPS tensor framework_proto selected_rows_utils data_device_transform data_type_transform data_layout_transform - phi) + phi + common) cc_library( attribute SRCS attribute.cc - DEPS framework_proto enforce) + DEPS framework_proto enforce common) cc_library( op_version_proto SRCS op_version_proto.cc @@ -271,7 +271,7 @@ cc_library( cc_library( shape_inference SRCS shape_inference.cc - DEPS 
phi attribute selected_rows_utils) + DEPS phi common attribute selected_rows_utils) # every source file that includes "dnnl.h" must depends on mkldnn # or, the first one should depends on mkldnn @@ -302,6 +302,7 @@ if(WITH_XPU) selected_rows_utils place phi + common var_type_traits op_info xpu_op_list) @@ -309,7 +310,13 @@ else() cc_library( phi_utils SRCS phi_utils.cc - DEPS lod_tensor selected_rows_utils place phi var_type_traits op_info) + DEPS lod_tensor + selected_rows_utils + place + phi + common + var_type_traits + op_info) endif() if(WITH_XPU) @@ -337,6 +344,7 @@ if(WITH_XPU) phi_utils infershape_utils phi + common op_compat_infos type_info) else() @@ -363,6 +371,7 @@ else() phi_utils infershape_utils phi + common op_compat_infos type_info) endif() @@ -380,7 +389,8 @@ add_dependencies( glog version xxhash - phi) + phi + common) cc_library( proto_desc @@ -395,6 +405,7 @@ cc_library( xxhash op_dist_attr phi + common op_version_proto op_version_registry) @@ -406,7 +417,7 @@ cc_library( cc_library( op_call_stack SRCS op_call_stack.cc - DEPS op_proto_maker enforce) + DEPS op_proto_maker enforce common) cc_library( program_utils @@ -677,7 +688,8 @@ if(WITH_DISTRIBUTE) fleet heter_server ${${EXTERNAL_BRPC_DEPS}} - phi) + phi + common) set(DISTRIBUTE_COMPILE_FLAGS "") if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") @@ -883,7 +895,7 @@ cc_library( cc_library( selected_rows_utils SRCS selected_rows_utils.cc - DEPS phi device_context) + DEPS phi common device_context) cc_library( dlpack_tensor @@ -904,6 +916,7 @@ cc_library( place var_type_traits phi + common phi_utils op_info shape_inference) @@ -938,6 +951,7 @@ cc_library( dynamic_loader string_helper phi + common imperative_flag layer) diff --git a/paddle/fluid/framework/convert_utils.h b/paddle/fluid/framework/convert_utils.h index 6995e21da89109..4fc86eba98dd65 100644 --- a/paddle/fluid/framework/convert_utils.h +++ b/paddle/fluid/framework/convert_utils.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #pragma once +#include "paddle/common/layout.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/utils/data_type.h" // TODO(chenweihang): this file may need to be removed diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 81075e0c5fb5bd..bf2f9e4379b693 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -434,7 +434,7 @@ static void RunInferShapeFunc( vec_ddim.end(), std::back_inserter(vec_shape), [&](const DDim& ddim) -> std::vector { - return phi::vectorize(ddim); + return common::vectorize(ddim); }); } else { // optional inputs, `vec_shape` is empty @@ -450,7 +450,7 @@ static void RunInferShapeFunc( } else { if (ctx->HasInput(in_name)) { // general inputs auto ddim = ctx->GetInputDim(in_name); - input_shapes.emplace_back(phi::vectorize(ddim)); + input_shapes.emplace_back(common::vectorize(ddim)); } else { // optional inputs PADDLE_ENFORCE( detail::IsOptionalVar(in_name), @@ -582,7 +582,7 @@ static void RunInferShapeFunc( } else { // Set output dims by the output of InferShapeFn ctx->SetOutputDim(out_name, - phi::make_ddim(output_shapes[output_shape_idx++])); + common::make_ddim(output_shapes[output_shape_idx++])); } } } diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 19c5196d2f933a..ca5c7b66b343ac 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -1021,7 +1021,7 @@ void MultiSlotDataFeed::PutToFeedVec( use_slots_shape_[i][inductive_shape_index_[i]] = total_instance / total_dims_without_inductive_[i]; } - feed_vec_[i]->Resize(phi::make_ddim(use_slots_shape_[i])); + feed_vec_[i]->Resize(common::make_ddim(use_slots_shape_[i])); } } #endif @@ -1423,7 +1423,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(const Record* ins_vec, int num) { use_slots_shape_[i][inductive_shape_index_[i]] = total_instance / total_dims_without_inductive_[i]; } - feed_vec_[i]->Resize(phi::make_ddim(use_slots_shape_[i])); + feed_vec_[i]->Resize(common::make_ddim(use_slots_shape_[i])); } } #endif @@ -1523,7 +1523,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( use_slots_shape_[i][inductive_shape_index_[i]] = total_instance / total_dims_without_inductive_[i]; } - feed_vec_[i]->Resize(phi::make_ddim(use_slots_shape_[i])); + feed_vec_[i]->Resize(common::make_ddim(use_slots_shape_[i])); } } #endif @@ -1568,7 +1568,7 @@ void PrivateInstantDataFeed::PutToFeedVec() { use_slots_[i].c_str(), total_dims, total_instance)); - feed_vec_[i]->Resize(phi::make_ddim(use_slots_shape_[i])); + feed_vec_[i]->Resize(common::make_ddim(use_slots_shape_[i])); } } } @@ -1998,7 +1998,7 @@ void PaddleBoxDataFeed::PutToFeedVec(const std::vector& ins_vec) { use_slots_shape_[i][inductive_shape_index_[i]] = total_instance / total_dims_without_inductive_[i]; } - feed_vec_[i]->Resize(phi::make_ddim(use_slots_shape_[i])); + feed_vec_[i]->Resize(common::make_ddim(use_slots_shape_[i])); } } #endif @@ -2615,7 +2615,7 @@ void SlotRecordInMemoryDataFeed::PutToFeedVec(const SlotRecord* ins_vec, info.local_shape[info.inductive_shape_index] = total_instance / info.total_dims_without_inductive; } - feed->Resize(phi::make_ddim(info.local_shape)); + feed->Resize(common::make_ddim(info.local_shape)); } else { LoD data_lod{slot_offset}; feed_vec_[j]->set_lod(data_lod); @@ -2987,7 +2987,7 @@ void SlotRecordInMemoryDataFeed::PackToScope(MiniBatchGpuPack* pack, info.local_shape[info.inductive_shape_index] 
= total_instance / info.total_dims_without_inductive; } - feed->Resize(phi::make_ddim(info.local_shape)); + feed->Resize(common::make_ddim(info.local_shape)); } else { LoD& lod = (*feed->mutable_lod()); lod.resize(1); diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index dd17c9d4d0bab3..6472f6eadbca9d 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -31,6 +31,7 @@ limitations under the License. */ #include #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/channel.h" @@ -41,7 +42,6 @@ limitations under the License. */ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" -#include "paddle/phi/core/macros.h" #if defined(PADDLE_WITH_CUDA) #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h index 93c5f805098b33..d7bc7c5a817181 100644 --- a/paddle/fluid/framework/data_layout.h +++ b/paddle/fluid/framework/data_layout.h @@ -18,4 +18,4 @@ limitations under the License. */ #include #include -#include "paddle/phi/common/layout.h" +#include "paddle/common/layout.h" diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 8e94a04ab161be..19960cf139d4ee 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -92,7 +92,7 @@ void TransDataLayout(DataLayout from_layout, dst_dim[i] = src_dim[axis[i]]; } - out->Resize(phi::make_ddim(dst_dim)); + out->Resize(common::make_ddim(dst_dim)); out->mutable_data(place, in.dtype()); framework::VisitDataType( diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 9af5fbfc6b4a59..459bea97b74463 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -25,7 +25,7 @@ #include #include #include -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" #ifdef PADDLE_WITH_GLOO #include diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index f76f6af9dce969..f0c2b60f41b69d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -10,15 +10,15 @@ cc_library( cc_library( scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc - DEPS op_handle_base scope lod_tensor phi memory) + DEPS op_handle_base scope lod_tensor phi common memory) cc_library( fetch_op_handle SRCS fetch_op_handle.cc - DEPS op_handle_base scope lod_tensor phi memory) + DEPS op_handle_base scope lod_tensor phi common memory) cc_library( fetch_async_op_handle SRCS fetch_async_op_handle.cc - DEPS op_handle_base scope lod_tensor phi memory) + DEPS op_handle_base scope lod_tensor phi common memory) cc_library( share_tensor_buffer_functor @@ -71,7 +71,7 @@ if(WITH_GPU) nv_library( nan_inf_utils SRCS nan_inf_utils_detail.cc - DEPS framework_proto scope place phi) + DEPS framework_proto scope place phi common) nv_library( all_reduce_op_handle SRCS all_reduce_op_handle.cc @@ -80,6 +80,7 @@ if(WITH_GPU) scope lod_tensor phi + common memory dynload_cuda) nv_library( @@ -91,6 +92,7 @@ if(WITH_GPU) scope lod_tensor phi + common memory dynload_cuda place) @@ -102,6 
+104,7 @@ if(WITH_GPU) scope lod_tensor phi + common memory dynload_cuda variable_visitor @@ -116,6 +119,7 @@ if(WITH_GPU) scope lod_tensor phi + common memory dynload_cuda variable_visitor @@ -127,17 +131,23 @@ if(WITH_GPU) nv_library( reduce_op_handle SRCS reduce_op_handle.cc - DEPS op_handle_base variable_visitor scope phi dynload_cuda) + DEPS op_handle_base variable_visitor scope phi common dynload_cuda) else() nv_library( reduce_op_handle SRCS reduce_op_handle.cc - DEPS op_handle_base variable_visitor scope phi dynload_cuda) + DEPS op_handle_base variable_visitor scope phi common dynload_cuda) endif() nv_library( broadcast_op_handle SRCS broadcast_op_handle.cc - DEPS op_handle_base scope phi memory variable_visitor dynload_cuda) + DEPS op_handle_base + scope + phi + common + memory + variable_visitor + dynload_cuda) nv_library( fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc @@ -146,7 +156,7 @@ elseif(WITH_ROCM) hip_library( nan_inf_utils SRCS nan_inf_utils_detail.cc - DEPS framework_proto scope place phi) + DEPS framework_proto scope place phi common) hip_library( all_reduce_op_handle SRCS all_reduce_op_handle.cc @@ -154,6 +164,7 @@ elseif(WITH_ROCM) scope lod_tensor phi + common memory dynload_cuda variable_visitor) @@ -166,6 +177,7 @@ elseif(WITH_ROCM) scope lod_tensor phi + common memory dynload_cuda place) @@ -177,6 +189,7 @@ elseif(WITH_ROCM) scope lod_tensor phi + common memory dynload_cuda variable_visitor @@ -187,17 +200,23 @@ elseif(WITH_ROCM) hip_library( reduce_op_handle SRCS reduce_op_handle.cc - DEPS op_handle_base variable_visitor scope phi dynload_cuda) + DEPS op_handle_base variable_visitor scope phi common dynload_cuda) else() hip_library( reduce_op_handle SRCS reduce_op_handle.cc - DEPS op_handle_base variable_visitor scope phi dynload_cuda) + DEPS op_handle_base variable_visitor scope phi common dynload_cuda) endif() hip_library( broadcast_op_handle SRCS broadcast_op_handle.cc - DEPS op_handle_base scope phi memory variable_visitor dynload_cuda) + DEPS op_handle_base + scope + phi + common + memory + variable_visitor + dynload_cuda) hip_library( fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc @@ -206,11 +225,17 @@ else() cc_library( nan_inf_utils SRCS nan_inf_utils_detail.cc - DEPS framework_proto scope place phi) + DEPS framework_proto scope place phi common) cc_library( all_reduce_op_handle SRCS all_reduce_op_handle.cc - DEPS op_handle_base scope lod_tensor phi memory variable_visitor) + DEPS op_handle_base + scope + lod_tensor + phi + common + memory + variable_visitor) cc_library( fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc @@ -219,6 +244,7 @@ else() scope lod_tensor phi + common memory variable_visitor place) @@ -230,6 +256,7 @@ else() scope lod_tensor phi + common memory variable_visitor place @@ -238,17 +265,17 @@ else() cc_library( reduce_op_handle SRCS reduce_op_handle.cc - DEPS op_handle_base variable_visitor scope phi) + DEPS op_handle_base variable_visitor scope phi common) else() cc_library( reduce_op_handle SRCS reduce_op_handle.cc - DEPS op_handle_base variable_visitor scope phi) + DEPS op_handle_base variable_visitor scope phi common) endif() cc_library( broadcast_op_handle SRCS broadcast_op_handle.cc - DEPS op_handle_base scope phi memory variable_visitor) + DEPS op_handle_base scope phi common memory variable_visitor) cc_library( fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc @@ -258,7 +285,7 @@ endif() cc_library( gather_op_handle SRCS gather_op_handle.cc - DEPS op_handle_base scope phi 
memory variable_visitor) + DEPS op_handle_base scope phi common memory variable_visitor) cc_library( eager_deletion_op_handle @@ -305,6 +332,7 @@ cc_test( op_handle_base scope phi + common memory device_context broadcast_op_handle) @@ -317,8 +345,9 @@ cc_test_old( var_handle op_handle_base scope - memory phi + common + memory device_context gather_op_handle) @@ -330,12 +359,17 @@ cc_library( scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor scope_buffered_monitor) -#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope phi memory +#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope phi common memory # device_context reduce_op_handle ) cc_library( bind_threaded_ssa_graph_executor SRCS bind_threaded_ssa_graph_executor.cc - DEPS fetch_op_handle phi ssa_graph_executor scope simple_threadpool + DEPS fetch_op_handle + phi + common + ssa_graph_executor + scope + simple_threadpool device_context) cc_library( fast_threaded_ssa_graph_executor diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index dc0f4e3fe1762a..60e45d226dd50f 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -217,7 +217,7 @@ struct TestBroadcastOpHandle { platform::errors::NotFound( "Variable %s is not found in scope.", varname)); auto lod_tensor = var->GetMutable(); - std::vector send_vector(static_cast(phi::product(kDims))); + std::vector send_vector(static_cast(common::product(kDims))); for (size_t k = 0; k < send_vector.size(); ++k) { send_vector[k] = k + val_scalar; } @@ -233,7 +233,7 @@ struct TestBroadcastOpHandle { const std::vector& rows, int height, float value_scalar = 0.0) { - std::vector send_vector(static_cast(phi::product(kDims))); + std::vector send_vector(static_cast(common::product(kDims))); for (size_t k = 0; k < send_vector.size(); ++k) { send_vector[k] = k + value_scalar; } @@ -290,7 +290,7 @@ struct TestBroadcastOpHandle { f::TensorCopySync(rt, cpu_place, &result_tensor); float* ct = result_tensor.data(); - for (int64_t i = 0; i < phi::product(kDims); ++i) { + for (int64_t i = 0; i < common::product(kDims); ++i) { ASSERT_NEAR(ct[i], send_vector[i], 1e-5); } } @@ -315,7 +315,7 @@ struct TestBroadcastOpHandle { phi::DenseTensor result_tensor; f::TensorCopySync(tensor, cpu_place, &result_tensor); float* ct = result_tensor.mutable_data(cpu_place); - for (int64_t k = 0; k < phi::product(kDims); ++k) { + for (int64_t k = 0; k < common::product(kDims); ++k) { ASSERT_NEAR(ct[k], send_vec[k], 1e-5); } } diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 900f0ebc4f111e..ee78d366711075 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -78,8 +78,8 @@ static void CheckTensorAttrs(const phi::DenseTensor *tensor, "(th) fetched variable. Please set the " "parameter `return_merged = False` when you " "call the `Executor.run()` method.", - phi::DataLayoutToString(layout), - phi::DataLayoutToString(tensor->layout()), + common::DataLayoutToString(layout), + common::DataLayoutToString(tensor->layout()), offset)); } @@ -175,7 +175,7 @@ void FetchAsyncOpHandle::FetchMergedLodTensor( // for 0D tensor, can't concat eath tensor. 
So stack 0D and concat 1+D tensor if (rank == 0) { int src_lodtensor_size = static_cast(src_lodtensors.size()); - new_dim = phi::make_ddim(std::vector({src_lodtensor_size})); + new_dim = common::make_ddim(std::vector({src_lodtensor_size})); } else { bool find_first_dims = false; for (auto *t : src_lodtensors) { diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 12d84aef8a8aca..8070a63bf2ce63 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -137,7 +137,7 @@ struct TestGatherOpHandle { int height = static_cast(kDims[0] * 2); std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; - std::vector send_vector(phi::product(kDims)); + std::vector send_vector(common::product(kDims)); for (size_t k = 0; k < send_vector.size(); ++k) { send_vector[k] = static_cast(k); } @@ -209,7 +209,7 @@ struct TestGatherOpHandle { float* ct = result_tensor.data(); for (int64_t j = 0; - j < phi::product(kDims) * static_cast(gpu_list_.size()); + j < common::product(kDims) * static_cast(gpu_list_.size()); ++j) { ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); } diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 31782e0d7bc9ea..8fb92fcfc12539 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -23,6 +23,7 @@ limitations under the License. */ #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/inplace_op_inference.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" @@ -33,7 +34,6 @@ limitations under the License. 
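// A minimal usage sketch of the renamed helper the handle tests above now
// call: common::product() collapses a DDim into its element count, which is
// what sizes the host-side send buffers. Hedged: assumes paddle/common/ddim.h
// keeps the old phi:: signature; the kDims stand-in value is illustrative.
#include <cstddef>
#include <vector>
#include "paddle/common/ddim.h"

std::vector<float> MakeSendBuffer() {
  auto dims = common::make_ddim({20, 20});  // stand-in for the tests' kDims
  // product() multiplies all extents, e.g. {20, 20} -> 400 elements.
  std::vector<float> buf(static_cast<size_t>(common::product(dims)));
  for (size_t k = 0; k < buf.size(); ++k) buf[k] = static_cast<float>(k);
  return buf;
}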
*/ #include "paddle/fluid/imperative/dygraph_grad_maker.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" -#include "paddle/phi/core/macros.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 7886fc5eae5286..459f4dfcff504e 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -162,7 +162,7 @@ struct TestReduceOpHandle { int height = kDims[0] * 2; std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; - std::vector send_vector(phi::product(kDims)); + std::vector send_vector(common::product(kDims)); for (size_t k = 0; k < send_vector.size(); ++k) { send_vector[k] = k; } @@ -232,13 +232,13 @@ struct TestReduceOpHandle { f::TensorCopySync(rt, cpu_place, &result_tensor); float *ct = result_tensor.data(); - for (int64_t j = 0; j < phi::product(result_tensor.dims()); ++j) { + for (int64_t j = 0; j < common::product(result_tensor.dims()); ++j) { ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); } } // namespace details void TestReduceLodTensors(size_t output_scope_idx) { - std::vector send_vector(static_cast(phi::product(kDims))); + std::vector send_vector(static_cast(common::product(kDims))); for (size_t k = 0; k < send_vector.size(); ++k) { send_vector[k] = k; } @@ -283,7 +283,7 @@ struct TestReduceOpHandle { f::TensorCopySync(rt, cpu_place, &result_tensor); float *ct = result_tensor.data(); - for (int64_t j = 0; j < phi::product(result_tensor.dims()); ++j) { + for (int64_t j = 0; j < common::product(result_tensor.dims()); ++j) { ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5); } } diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 6aac8b6fd51cdf..8b486be9cc686a 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -108,7 +108,7 @@ void ScaleLossGradOpHandle::RunImpl() { void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { auto *tensor = var->GetMutable(); - tensor->Resize(phi::make_ddim({1})); + tensor->Resize(common::make_ddim({1})); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ScaleLossGradFunctor func( diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 25d29e469a4985..c9c0817a756020 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -30,6 +30,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/wrapper/fleet.h" #endif +#include "paddle/common/macros.h" #include "paddle/fluid/framework/barrier.h" #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/executor_gc_helper.h" @@ -44,7 +45,6 @@ limitations under the License. 
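// Sketch of why hunks like the device_worker.h one above only change an
// include path: the macro family previously in paddle/phi/core/macros.h now
// comes from paddle/common/macros.h. That DISABLE_COPY_AND_ASSIGN moved over
// unchanged is an assumption here, not something this patch shows directly.
#include "paddle/common/macros.h"

class PatchedWorker {
 public:
  PatchedWorker() = default;
  // Deletes copy/move construction and assignment, as before the move.
  DISABLE_COPY_AND_ASSIGN(PatchedWorker);
};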
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" #include "paddle/phi/backends/dynload/port.h" -#include "paddle/phi/core/macros.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index 4cd6d97a0c5cc3..943ee88b67695b 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -41,7 +41,7 @@ class DLPackTensor { // The shape in DLTensor is defined as int64_t* // Add this member to make TVMTensor init without heap allocation - ShapeType shape_[DDim::kMaxRank]; + ShapeType shape_[phi::DDim::kMaxRank]; }; DLManagedTensor* toDLPack(const phi::DenseTensor& src); diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index bbf34c03130c11..2da048361d254b 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -88,8 +88,8 @@ struct EigenMatrix : public EigenTensor { "between 0 and %d, but received number is %d.", rank, num_col_dims)); - return EigenMatrix::From(tensor, - phi::flatten_to_2d(tensor.dims(), num_col_dims)); + return EigenMatrix::From( + tensor, common::flatten_to_2d(tensor.dims(), num_col_dims)); } static typename EigenMatrix::ConstType Reshape(const phi::DenseTensor& tensor, @@ -102,8 +102,8 @@ struct EigenMatrix : public EigenTensor { "between 0 and %d, but received number is %d.", rank, num_col_dims)); - return EigenMatrix::From(tensor, - phi::flatten_to_2d(tensor.dims(), num_col_dims)); + return EigenMatrix::From( + tensor, common::flatten_to_2d(tensor.dims(), num_col_dims)); } }; diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 2cae0721aefa99..a8ce9be92bdf68 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -95,7 +95,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, req_var->set_data_type(static_cast( framework::TransToProtoVarType(tensor->dtype()))); - for (auto& dim : phi::vectorize(tensor->dims())) { + for (auto& dim : common::vectorize(tensor->dims())) { req_var->add_dims(dim); } const framework::LoD lod = tensor->lod(); @@ -154,7 +154,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, for (auto& x : req_var.dims()) { vec_dim.push_back(x); } - tensor->Resize(phi::make_ddim(vec_dim)); + tensor->Resize(common::make_ddim(vec_dim)); LoD lod; for (int i = 0; i < req_var.lod_level(); ++i) { @@ -199,7 +199,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, for (auto& x : req_var.dims()) { vec_dim.push_back(x); } - tensor->Resize(phi::make_ddim(vec_dim)); + tensor->Resize(common::make_ddim(vec_dim)); LoD lod; for (int i = 0; i < req_var.lod_level(); ++i) { diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index ea55d5cd6c818a..24be8e04d8d507 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -39,7 +39,7 @@ void SetMicroId(paddle::framework::Scope* scope, "the type of microbatch_id should be phi::DenseTensor")); auto* tensor = var->GetMutable(); std::vector dims{1}; - tensor->Resize(phi::make_ddim(dims)); + tensor->Resize(common::make_ddim(dims)); void* tensor_data = tensor->mutable_data( place, framework::TransToPhiDataType(framework::proto::VarType::FP32)); if (platform::is_gpu_place(place)) { diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 
4c41bc27f1730e..88f0b496a8e4c6 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -218,7 +218,7 @@ DDim CompatMetaTensor::dims() const { } else if (var->IsType()) { // use tensor array size as dims auto& tensor_array = var->Get(); - return phi::make_ddim({static_cast(tensor_array.size())}); + return common::make_ddim({static_cast(tensor_array.size())}); } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can get dims from DenseTensor or SelectedRows or " @@ -227,9 +227,9 @@ DDim CompatMetaTensor::dims() const { } else { auto* var = PADDLE_GET_CONST(VarDesc*, var_); - return phi::make_ddim(var->GetShape()); - // return var->GetShape().empty() ? phi::make_ddim({0UL}) : - // phi::make_ddim(var->GetShape()); + return common::make_ddim(var->GetShape()); + // return var->GetShape().empty() ? common::make_ddim({0UL}) : + // common::make_ddim(var->GetShape()); } } @@ -316,7 +316,7 @@ void CompatMetaTensor::set_dims(const DDim& dims) { } else { auto* var = PADDLE_GET(VarDesc*, var_); if (var) { - var->SetShape(vectorize(dims)); + var->SetShape(common::vectorize(dims)); } } } diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h index 2a1a44e57a30a8..5c5589fd149512 100644 --- a/paddle/fluid/framework/inplace_op_inference.h +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -16,9 +16,9 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/type_defs.h" -#include "paddle/phi/core/macros.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 8336340849fb46..bc08f1d72f0f91 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -32,7 +32,7 @@ cc_library( cc_library( cost_model SRCS cost_model.cc - DEPS executor graph profiler proto_desc phi) + DEPS executor graph profiler proto_desc phi common) set(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits) if(WITH_TESTING) @@ -238,7 +238,7 @@ if(WITH_XPU) cc_library( xpu_quant_utils SRCS xpu/quant_utils.cc - DEPS pass phi) + DEPS pass phi common) cc_library( xpu_pass_utils SRCS xpu/pass_utils.cc @@ -541,7 +541,8 @@ if(WITH_MKLDNN) concat_and_split naive_executor device_context - phi) + phi + common) if(WITH_GPU OR WITH_ROCM) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) endif() diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 9764f78744974e..5e82534ba67452 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -264,12 +264,12 @@ void PrepareParameters(Graph* graph, const Param& param, ir::Node* lstm_op) { platform::errors::InvalidArgument( "phi::DenseTensor attention bias dimension size(%d) must be 1.", attention_bias_t->dims().size())); - attention_bias_t->Resize(phi::make_ddim({1, attention_bias_t->dims()[0]})); + attention_bias_t->Resize(common::make_ddim({1, attention_bias_t->dims()[0]})); auto* attention_scalar_bias_t = scope.FindVar(param.AttentionScalarBias)->GetMutable(); attention_scalar_bias_t->Resize( - phi::make_ddim({1, attention_scalar_bias_t->dims()[0]})); + common::make_ddim({1, attention_scalar_bias_t->dims()[0]})); PrepareLSTMWeight(W_forget_w0_t, W_forget_w1_t, @@ -296,7 +296,7 @@ void PrepareLSTMWeight(const 
phi::DenseTensor& W_forget_w0, phi::DenseTensor* out) { int D = static_cast(W_forget_w0.dims()[0]); int M = static_cast(W_forget_w1.dims()[0]); - out->Resize(phi::make_ddim({D + M, 4 * D})); + out->Resize(common::make_ddim({D + M, 4 * D})); VLOG(3) << "LSTMWeight resized to " << out->dims(); float* out_data = out->mutable_data(platform::CPUPlace()); @@ -343,7 +343,7 @@ void PrepareLSTMBias(const phi::DenseTensor& B_forget, "phi::DenseTensor B forget dimension size(%d) must be 1.", B_forget.dims().size())); int D = static_cast(B_forget.dims()[0]); - out->Resize(phi::make_ddim({1, 4 * D})); + out->Resize(common::make_ddim({1, 4 * D})); auto* out_data = out->mutable_data(platform::CPUPlace()); for (size_t i = 0; i < tensors.size(); i++) { memcpy(out_data + D * i, tensors[i], D * sizeof(float)); diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index d29ef0f9ad1fad..61080c52c94bac 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" +#include "paddle/common/errors.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/operator.h" #include "paddle/phi/common/bfloat16.h" @@ -21,7 +22,6 @@ #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/device_manager.h" #endif diff --git a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc index 7d3c105f749387..2640bd9cd74ee3 100644 --- a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc +++ b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc @@ -18,9 +18,9 @@ #include #include "paddle/fluid/framework/ir/cutlass_teller.h" +#include "paddle/common/layout.h" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/fluid/framework/ir/conv2d_trans_filter_dilations_nxn_to_1x1_pass.cc b/paddle/fluid/framework/ir/conv2d_trans_filter_dilations_nxn_to_1x1_pass.cc index 7ba36c11313859..af3187d9d64bbb 100644 --- a/paddle/fluid/framework/ir/conv2d_trans_filter_dilations_nxn_to_1x1_pass.cc +++ b/paddle/fluid/framework/ir/conv2d_trans_filter_dilations_nxn_to_1x1_pass.cc @@ -191,7 +191,7 @@ void Conv2dTransFilterDilationsNxNTo1x1Pass::conv2d_dilation_trans( VarDesc new_weights_desc(new_weights_name); new_weights_desc.SetPersistable(true); - new_weights_desc.SetShape(vectorize(new_weights->dims())); + new_weights_desc.SetShape(common::vectorize(new_weights->dims())); new_weights_desc.SetDataType( framework::TransToProtoVarType(new_weights->dtype())); auto* new_weights_node = graph->CreateVarNode(&new_weights_desc); diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index aa15b2696d7a12..335d7034ca6496 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -167,7 +167,7 @@ void recompute_bias_and_weights(const Scope* scope, } } } else { - auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); + auto weights_shape_2d = 
common::flatten_to_2d(weights_shape, 1); EigenMatrixArrayMap weights_array_2d( weights_data, weights_shape_2d[0], weights_shape_2d[1]); @@ -376,7 +376,7 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { if (!mkldnn_with_bias) { VarDesc eltwise_y_in_desc( patterns::PDNodeName("fuse_conv_bn", conv_type() + "_eltwise_y_in")); - eltwise_y_in_desc.SetShape(phi::vectorize(bn_bias_tensor->dims())); + eltwise_y_in_desc.SetShape(common::vectorize(bn_bias_tensor->dims())); eltwise_y_in_desc.SetDataType( framework::TransToProtoVarType(bn_bias_tensor->dtype())); eltwise_y_in_desc.SetLoDLevel(bn_bias->Var()->GetLoDLevel()); @@ -674,7 +674,8 @@ void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const { // Create eltwise_y (conv bias) variable VarDesc eltwise_y_in_desc(patterns::PDNodeName( name_scope_, "eltwise_y_in" + std::to_string(found_conv_bn_count))); - eltwise_y_in_desc.SetShape(phi::vectorize(eltwise_y_in_tensor->dims())); + eltwise_y_in_desc.SetShape( + common::vectorize(eltwise_y_in_tensor->dims())); eltwise_y_in_desc.SetDataType( framework::TransToProtoVarType(eltwise_y_in_tensor->dtype())); eltwise_y_in_desc.SetLoDLevel(eltwise_y_in->Var()->GetLoDLevel()); diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc index cd8312214af8d4..75ac438a97f856 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc @@ -255,8 +255,8 @@ void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { int begin_norm_axis = PADDLE_GET_CONST(int, layer_norm->Op()->GetAttr("begin_norm_axis")); auto layer_norm_x_dims = fc_out->Var()->GetShape(); - auto layer_norm_x_mat_dims = - phi::flatten_to_2d(phi::make_ddim(layer_norm_x_dims), begin_norm_axis); + auto layer_norm_x_mat_dims = common::flatten_to_2d( + common::make_ddim(layer_norm_x_dims), begin_norm_axis); if (fc_w->Var()->GetShape()[1] != layer_norm_x_mat_dims[1]) { return; } diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc index 99bb9c59206853..9422980a429f21 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc @@ -1474,7 +1474,7 @@ inline void QKVWeightsProcess(phi::DenseTensor* wq_tensor, auto* wk_data = wk_tensor->data(); auto* wv_data = wv_tensor->data(); - auto combined_w_dims = phi::make_ddim({3, num_head, dim_head, dim_embed}); + auto combined_w_dims = common::make_ddim({3, num_head, dim_head, dim_embed}); phi::DenseTensor tmp_combined_w_tensor; tmp_combined_w_tensor.Resize(combined_w_dims); @@ -1516,7 +1516,7 @@ inline void QKVBiasProcess(phi::DenseTensor* bq_tensor, auto* bk_data = bk_tensor->data(); auto* bv_data = bv_tensor->data(); - auto combined_bias_dims = phi::make_ddim({3, num_head, dim_head}); + auto combined_bias_dims = common::make_ddim({3, num_head, dim_head}); phi::DenseTensor tmp_combined_bias_tensor; tmp_combined_bias_tensor.Resize(combined_bias_dims); @@ -1590,7 +1590,7 @@ inline void QKVWeightsProcessFuseQKV(phi::DenseTensor* qkv_w_tensor, auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CPUPlace())); auto* qkv_w_data = qkv_w_tensor->data(); - auto transpose_w_dims = phi::make_ddim({3, num_head, dim_head, dim_embed}); + auto transpose_w_dims = common::make_ddim({3, num_head, dim_head, dim_embed}); 
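// Round-trip sketch for the two shape helpers this pass now takes from
// common:: instead of phi::. Assumes make_ddim()/vectorize() keep their old
// signatures: a braced list (or std::vector<int64_t>) in, a DDim out, and
// vectorize() converting back. The concrete dims below are illustrative.
#include <cassert>
#include <cstdint>
#include <vector>
#include "paddle/common/ddim.h"

void RoundTrip() {
  // Mirrors the combined QKV dims built above: {3, num_head, dim_head, dim_embed}.
  auto dims = common::make_ddim({3, 8, 64, 768});
  std::vector<int64_t> shape = common::vectorize(dims);
  assert(shape.size() == 4 && shape[0] == 3 && shape[3] == 768);
}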
phi::DenseTensor tmp_transpose_w_tensor; tmp_transpose_w_tensor.Resize(transpose_w_dims); @@ -1628,7 +1628,7 @@ inline void QKVBiasProcessFuseQKV(phi::DenseTensor* qkv_b_tensor, auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CPUPlace())); auto* qkv_b_data = qkv_b_tensor->data(); - auto transpose_b_dims = phi::make_ddim({3, num_head, dim_head}); + auto transpose_b_dims = common::make_ddim({3, num_head, dim_head}); phi::DenseTensor tmp_transpose_b_tensor; tmp_transpose_b_tensor.Resize(transpose_b_dims); diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt index 2357247b37d794..570b081aae95ed 100644 --- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt @@ -6,13 +6,13 @@ if(WITH_GPU OR WITH_ROCM) cc_test( test_code_generator SRCS code_generator_tester.cc - DEPS code_generator phi lod_tensor graph_viz_pass) + DEPS code_generator phi common lod_tensor graph_viz_pass) endif() cc_library( fusion_group_pass SRCS fusion_group_pass.cc elementwise_group_detector.cc - DEPS subgraph_detector fuse_pass_base code_generator phi) + DEPS subgraph_detector fuse_pass_base code_generator phi common) cc_test( test_fusion_group_pass SRCS fusion_group_pass_tester.cc diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index dd929f5329bae5..9749fb2bfa81c5 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -271,8 +271,8 @@ void TestElementwiseMain( // Prepare CPU tensors which always hold float. std::vector cpu_tensors(ids.size()); - auto dims = - phi::make_ddim({static_cast(256), static_cast(1024)}); + auto dims = common::make_ddim( + {static_cast(256), static_cast(1024)}); for (auto& cpu_tensor : cpu_tensors) { cpu_tensor.mutable_data(dims, paddle::platform::CPUPlace()); } diff --git a/paddle/fluid/framework/ir/ipu/delete_scale_op_pass.cc b/paddle/fluid/framework/ir/ipu/delete_scale_op_pass.cc index 6300c0e32971db..391373578cc24b 100644 --- a/paddle/fluid/framework/ir/ipu/delete_scale_op_pass.cc +++ b/paddle/fluid/framework/ir/ipu/delete_scale_op_pass.cc @@ -14,13 +14,13 @@ #include "paddle/fluid/framework/ir/ipu/delete_scale_op_pass.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc index 5c14849dd01c36..3967e82ba6e804 100644 --- a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc @@ -14,12 +14,12 @@ #include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/device/ipu/ipu_backend.h" -#include 
"paddle/phi/core/ddim.h" namespace paddle { namespace framework { @@ -74,7 +74,7 @@ void InferShapePass::ApplyImpl(ir::Graph* graph) const { paddle::framework::InitializeVariable(ptr, var_desc->GetType()); auto tensor = ptr->GetMutable(); - tensor->Resize(phi::make_ddim(var_desc->GetShape())); + tensor->Resize(common::make_ddim(var_desc->GetShape())); } // infer shape @@ -94,7 +94,7 @@ void InferShapePass::ApplyImpl(ir::Graph* graph) const { for (int i = 0; i < it->second.size(); i++) { auto output_name = op_desc->Output(it->first)[i]; auto dim = it->second[i]->GetMutable()->dims(); - auto new_shape = phi::vectorize(dim); + auto new_shape = common::vectorize(dim); for (auto output_node : node->outputs) { if (output_node->Name() == output_name) { output_node->Var()->SetShape(new_shape); diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index fc894c07e0966d..56323c16051367 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -337,7 +337,7 @@ void LayerNormFusePass::ApplyImpl(Graph* graph) const { // gamma/beta must be a 1-dimensional tensor of size on layer_norm auto layer_norm_x_mat_dims = - phi::flatten_to_2d(phi::make_ddim(x_shape), begin_norm_axis); + common::flatten_to_2d(common::make_ddim(x_shape), begin_norm_axis); auto* gamma_tensor = scope->FindVar(gamma->Name())->GetMutable(); VarDesc new_gamma_desc(patterns::PDNodeName("layer_norm_fuse", "Scale")); @@ -349,7 +349,7 @@ void LayerNormFusePass::ApplyImpl(Graph* graph) const { auto* new_gamma_node = g->CreateVarNode(&new_gamma_desc); auto* new_gamma_tensor = scope->Var(new_gamma_node->Name())->GetMutable(); - new_gamma_tensor->Resize(phi::make_ddim({layer_norm_x_mat_dims[1]})); + new_gamma_tensor->Resize(common::make_ddim({layer_norm_x_mat_dims[1]})); memcpy(new_gamma_tensor->mutable_data(platform::CPUPlace()), gamma_tensor->mutable_data(platform::CPUPlace()), layer_norm_x_mat_dims[1] * sizeof(float)); @@ -366,7 +366,7 @@ void LayerNormFusePass::ApplyImpl(Graph* graph) const { auto* new_beta_tensor = scope->Var(new_beta_node->Name())->GetMutable(); - new_beta_tensor->Resize(phi::make_ddim({layer_norm_x_mat_dims[1]})); + new_beta_tensor->Resize(common::make_ddim({layer_norm_x_mat_dims[1]})); memcpy(new_beta_tensor->mutable_data(platform::CPUPlace()), beta_tensor->mutable_data(platform::CPUPlace()), layer_norm_x_mat_dims[1] * sizeof(float)); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index 96594cbd022dc5..d0618616619037 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -43,7 +43,7 @@ if(WITH_CINN) cc_library( share_varinfo_into_cinn_pass SRCS share_varinfo_into_cinn_pass.cc - DEPS pass enforce graph_helper computation_op_handle + DEPS pass enforce common graph_helper computation_op_handle eager_deletion_op_handle) cc_test( share_varinfo_into_cinn_pass_test @@ -81,4 +81,5 @@ cc_library( cc_test( test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc - DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op phi) + DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op phi + common) diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc index 
1738259d60f004..d63c52060651ef 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -121,7 +121,7 @@ void ComputePropagateScalesMkldnnPass::ComputeVarScales( phi::DenseTensor tmp_tensor; std::vector reshape_dims = {dims[0], volume}; - tmp_tensor.Resize(phi::make_ddim(reshape_dims)); + tmp_tensor.Resize(common::make_ddim(reshape_dims)); auto* weight_data = weight_tensor->data(); auto* tmp_data = tmp_tensor.mutable_data(phi::CPUPlace()); for (int i = 0; i < weight_tensor->numel(); i++) { diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc index e4a37ad2c7a7ff..7a811aae50e236 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc @@ -146,7 +146,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { auto* wx_var = scope.FindVar(wx_var_names); auto* wx_tensor = wx_var->GetMutable(); - wx_tensor->Resize(phi::make_dim(wx.size(), wx[0].size())); + wx_tensor->Resize(common::make_dim(wx.size(), wx[0].size())); for (size_t i = 0; i < wx.size(); i++) std::copy( begin(wx[i]), @@ -155,7 +155,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { auto* wh_var = scope.FindVar(wh_var_names); auto* wh_tensor = wh_var->GetMutable(); - wh_tensor->Resize(phi::make_dim(wh.size(), wh[0].size())); + wh_tensor->Resize(common::make_dim(wh.size(), wh[0].size())); for (size_t i = 0; i < wh.size(); i++) std::copy( begin(wh[i]), @@ -280,7 +280,7 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, get_scales_function) { float max_val = *std::max_element(values.begin(), values.end()); phi::DenseTensor var_tensor; - var_tensor.Resize(phi::make_dim(values.size(), 1)); + var_tensor.Resize(common::make_dim(values.size(), 1)); std::copy(begin(values), end(values), var_tensor.mutable_data(phi::CPUPlace())); @@ -307,7 +307,7 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, compute_var_scales) { auto* var = scope.FindVar(weight_var_name); auto* weight_tensor = var->GetMutable(); - weight_tensor->Resize(phi::make_dim(1, values.size())); + weight_tensor->Resize(common::make_dim(1, values.size())); std::copy(begin(values), end(values), weight_tensor->mutable_data(phi::CPUPlace())); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc index 9639d3f374bef4..eedb5b3b60bd5e 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc @@ -90,7 +90,7 @@ void recompute_bias_and_weights(const Scope* scope, auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); auto weights_shape = weights->dims(); - auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); + auto weights_shape_2d = common::flatten_to_2d(weights_shape, 1); auto* weights_data = weights->mutable_data(phi::CPUPlace()); EigenMatrixArrayMap weights_array_2d( @@ -266,7 +266,7 @@ void ConvAffineChannelFusePass::FuseConvAffineChannel( VarDesc eltwise_y_in_desc( patterns::PDNodeName(name_scope_, "eltwise_y_in")); // Set shape && datatype manually - eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims())); + 
eltwise_y_in_desc.SetShape(common::vectorize(ac_bias_tensor->dims())); eltwise_y_in_desc.SetDataType( framework::TransToProtoVarType(ac_bias_tensor->dtype())); eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel()); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 8d8504708f0373..2f1e7e8a53865c 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -1178,7 +1178,7 @@ void CPUQuantizePass::QuantizeMultiGru(Graph* graph) const { VarDesc scale_var_desc(patterns::PDNodeName("multi_gru", "w_scale")); - scale_var_desc.SetShape(phi::vectorize(scale_tensor_src.dims())); + scale_var_desc.SetShape(common::vectorize(scale_tensor_src.dims())); scale_var_desc.SetDataType(proto::VarType::FP32); scale_var_desc.SetLoDLevel(scale_tensor_src.lod().size()); scale_var_desc.SetPersistable(true); diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc index 08aafa4a60a0e7..a1f74d3423006b 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc @@ -16,10 +16,10 @@ #include +#include "paddle/common/errors.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/errors.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc index 697a34904c817e..f9e8722ccf3978 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc @@ -19,11 +19,11 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/phi/core/errors.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc index 58e2a74ce1d405..72b07fc8934de5 100755 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc @@ -47,7 +47,7 @@ struct TestScope { void CreateTensor(const std::string& var_name, const Data& data) { auto variable = scope.Var(var_name); auto tensor = variable->GetMutable(); - tensor->Resize(phi::make_ddim(data.getShape())); + tensor->Resize(common::make_ddim(data.getShape())); auto dptr = tensor->mutable_data(place); std::copy(data.getData().begin(), data.getData().end(), dptr); } diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 0087886c1c8d7b..e5cd2a9007b381 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -420,10 +420,10 @@ void QuantDequantMkldnnPass::TransposeWeight(phi::DenseTensor* input) const { out_dim_v.push_back(in_dims[i]); } - const auto out_dims = phi::make_ddim(out_dim_v); + const auto out_dims = 
common::make_ddim(out_dim_v); const int rank = axis.size(); - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out_dims); + auto in_stride = common::stride(in_dims); + auto out_stride = common::stride(out_dims); const int count = input->numel(); phi::DenseTensor trans_tensor; @@ -490,7 +490,7 @@ void QuantDequantMkldnnPass::ConvertFromINT8ToFP32( } weight_tensor->clear(); // clear int weight - weight_tensor->Resize(phi::make_ddim(phi::vectorize(weight_dims))); + weight_tensor->Resize(common::make_ddim(common::vectorize(weight_dims))); auto* new_weight_data = weight_tensor->mutable_data(phi::CPUPlace()); memcpy(new_weight_data, weight_data.data(), @@ -532,7 +532,7 @@ void QuantDequantMkldnnPass::ConvertFromINT8ToFP32( } } weight_tensor->clear(); // clear int weight - weight_tensor->Resize(phi::make_ddim(phi::vectorize(weight_dims))); + weight_tensor->Resize(common::make_ddim(common::vectorize(weight_dims))); auto* new_weight_data = weight_tensor->mutable_data(phi::CPUPlace()); memcpy(new_weight_data, weight_data.data(), diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 85f62c4a293fce..295ef57cfdfead 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -868,8 +868,8 @@ size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID( PADDLE_ENFORCE_NOT_NULL(var_desc, platform::errors::NotFound( "Can not find Var(%s) in Var Desc.", var_name)); - auto dim = phi::make_ddim(var_desc->GetShape()); - int64_t numel = phi::product(dim); + auto dim = common::make_ddim(var_desc->GetShape()); + int64_t numel = common::product(dim); PADDLE_ENFORCE_GT(numel, 0, platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 0fd3a71754f6d9..22802dbddd8efe 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -653,8 +653,8 @@ inline void QKVWeightsProcess(phi::DenseTensor* wq_tensor, auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); auto combined_w_dims = - phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); - auto combined_bias_dims = phi::make_ddim({3, bq_tensor->dims()[0]}); + common::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = common::make_ddim({3, bq_tensor->dims()[0]}); phi::DenseTensor tmp_combined_w_tensor; tmp_combined_w_tensor.Resize(combined_w_dims); @@ -1362,8 +1362,8 @@ int MultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); auto combined_w_dims = - phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); - auto combined_bias_dims = phi::make_ddim({3, bq_tensor->dims()[0]}); + common::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = common::make_ddim({3, bq_tensor->dims()[0]}); // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. 
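// Sketch of the stride() helper used by TransposeWeight above: for a
// row-major DDim it yields, at position i, the product of all extents after
// i, so flat offsets can be remapped across a permutation. Assumes
// common::stride() keeps the phi:: behaviour of returning a DDim of strides.
#include "paddle/common/ddim.h"

int64_t FlatOffset() {
  auto dims = common::make_ddim({2, 3, 4});
  auto stride = common::stride(dims);  // {12, 4, 1}
  // Offset of element (1, 2, 3) in the contiguous buffer: 1*12 + 2*4 + 3*1.
  return 1 * stride[0] + 2 * stride[1] + 3 * stride[2];  // 23
}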
auto* combined_w_desc = mul0_w->Var(); diff --git a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc index be5fad23fd6e2d..1f91b6955aadfe 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc @@ -449,8 +449,8 @@ int MultiHeadMatmulRoformerFusePass::BuildFusion(Graph* graph, auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); auto combined_w_dims = - phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); - auto combined_bias_dims = phi::make_ddim({3, bq_tensor->dims()[0]}); + common::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = common::make_ddim({3, bq_tensor->dims()[0]}); // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. auto* combined_w_desc = mul0_w->Var(); diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 473890a4b786ba..753fadd242ebc2 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -22,10 +22,10 @@ limitations under the License. */ #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/any.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/pass_test_util.cc b/paddle/fluid/framework/ir/pass_test_util.cc index ea3532135fafbb..ee75794d7ccc42 100644 --- a/paddle/fluid/framework/ir/pass_test_util.cc +++ b/paddle/fluid/framework/ir/pass_test_util.cc @@ -187,7 +187,8 @@ void InitLoDTensorHolder(const Scope& scope, const T* data) { auto var = scope.FindLocalVar(var_name); auto tensor = var->GetMutable(); - auto* tensor_mem_ptr = tensor->mutable_data(phi::make_ddim(dims), place); + auto* tensor_mem_ptr = + tensor->mutable_data(common::make_ddim(dims), place); if (data != nullptr) { std::memcpy(tensor_mem_ptr, data, tensor->memory_size()); } else { diff --git a/paddle/fluid/framework/ir/pass_test_util.h b/paddle/fluid/framework/ir/pass_test_util.h index 44f6c66295466b..8032205fbe81db 100644 --- a/paddle/fluid/framework/ir/pass_test_util.h +++ b/paddle/fluid/framework/ir/pass_test_util.h @@ -18,13 +18,13 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/split_layernorm_to_math_ops_pass.cc b/paddle/fluid/framework/ir/split_layernorm_to_math_ops_pass.cc index f28b768513f9f5..35e1fe74948f39 100644 --- a/paddle/fluid/framework/ir/split_layernorm_to_math_ops_pass.cc +++ b/paddle/fluid/framework/ir/split_layernorm_to_math_ops_pass.cc @@ -268,7 +268,7 @@ void SplitLayerNormPass::ApplyImpl(Graph* graph) const { pow_y->SetPersistable(true); auto* pow_y_node = graph->CreateVarNode(pow_y); auto* pow_y_tensor = scope->Var(pow_y_name)->GetMutable(); - pow_y_tensor->Resize(phi::make_ddim({1})); + pow_y_tensor->Resize(common::make_ddim({1})); dev_ctx->Alloc(pow_y_tensor); (pow_y_tensor->data())[0] = 2.0f; @@ -312,7 +312,7 @@ void SplitLayerNormPass::ApplyImpl(Graph* graph) const { 
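// flatten_to_2d(), seen in the layer-norm and conv-bn hunks of this patch,
// collapses a DDim into a matrix around an axis: dims before the axis form
// the rows, the rest the columns. A minimal sketch, assuming the common::
// overload matches the old phi::flatten_to_2d(dims, num_col_dims) semantics.
#include "paddle/common/ddim.h"

void FlattenForLayerNorm() {
  auto x = common::make_ddim({8, 16, 768});
  auto mat = common::flatten_to_2d(x, /*num_col_dims=*/2);  // -> [8 * 16, 768]
  // mat[1] is the width the fuse passes compare against the scale/bias length.
}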
add_y->SetPersistable(true); auto* add_y_node = graph->CreateVarNode(add_y); auto* add_y_tensor = scope->Var(add_y_name)->GetMutable(); - add_y_tensor->Resize(phi::make_ddim({1})); + add_y_tensor->Resize(common::make_ddim({1})); dev_ctx->Alloc(add_y_tensor); (add_y_tensor->data())[0] = eps; @@ -364,7 +364,7 @@ void SplitLayerNormPass::ApplyImpl(Graph* graph) const { scope->Var(new_scale_name)->GetMutable(); auto* scale_tensor = scope->Var(layer_norm_scale->Name())->GetMutable(); - new_scale_tensor->Resize(phi::make_ddim(shape_int64)); + new_scale_tensor->Resize(common::make_ddim(shape_int64)); dev_ctx->Alloc(new_scale_tensor); memcpy(new_scale_tensor->data(), scale_tensor->data(), @@ -393,7 +393,7 @@ void SplitLayerNormPass::ApplyImpl(Graph* graph) const { scope->Var(new_bias_name)->GetMutable(); auto* bias_tensor = scope->Var(layer_norm_bias->Name())->GetMutable(); - new_bias_tensor->Resize(phi::make_ddim(shape_int64)); + new_bias_tensor->Resize(common::make_ddim(shape_int64)); dev_ctx->Alloc(new_bias_tensor); memcpy(new_bias_tensor->data(), bias_tensor->data(), diff --git a/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc index 3f94c97baa6d8e..42a64e1a54007f 100644 --- a/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc @@ -327,7 +327,7 @@ int TrtCrossMultiHeadMatmulFusePass::BuildCrossFusion( auto* wv_data = wv_tensor->data(); // combined_w_dims = [in,2,out] auto combined_w_kv_dims = - phi::make_ddim({wk_tensor->dims()[0], 2, wk_tensor->dims()[1]}); + common::make_ddim({wk_tensor->dims()[0], 2, wk_tensor->dims()[1]}); VLOG(5) << "trt cross attention trt wk_dim in:" << wk_tensor->dims()[0] << "trt cross attention trt wk_dim out:" << wk_tensor->dims()[1]; auto* combined_w_kv_desc = mul1_w->Var(); diff --git a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc index 58ee274e1c62c6..deee6c9bb855bf 100644 --- a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc @@ -334,7 +334,7 @@ void TrtDeleteWeightQuantDequantLinearOpPass::ApplyImpl( "OP'attribute ")); } weight_tensor->clear(); // clear int weight - weight_tensor->Resize(phi::make_ddim(phi::vectorize(w_dims))); + weight_tensor->Resize(common::make_ddim(common::vectorize(w_dims))); float* new_quantized_weight_data = dev_ctx->HostAlloc( weight_tensor, weight_tensor->numel() * sizeof(float)); memcpy(new_quantized_weight_data, diff --git a/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc index dd0726dba572ba..200d2e8ad0d384 100644 --- a/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc @@ -336,7 +336,7 @@ int TrtFlashMultiHeadMatmulFusePass::BuildFlashFusion( // auto dims = wq_tensor->dims(); // combined_w_dims = [in,3,out] auto combined_w_dims = - phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + common::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); auto* combined_w_desc = mul0_w->Var(); combined_w_desc->SetShape( {wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc 
b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc index db62c5dd3789cc..0bee108064d083 100644 --- a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc @@ -791,8 +791,8 @@ int TrtMultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); auto combined_w_dims = - phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); - auto combined_bias_dims = phi::make_ddim({3, bq_tensor->dims()[0]}); + common::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = common::make_ddim({3, bq_tensor->dims()[0]}); // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. auto* combined_w_desc = mul0_w->Var(); @@ -1257,8 +1257,8 @@ int TrtMultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); auto combined_w_dims = - phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); - auto combined_bias_dims = phi::make_ddim({3, bq_tensor->dims()[0]}); + common::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = common::make_ddim({3, bq_tensor->dims()[0]}); // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. auto* combined_w_desc = mul0_w->Var(); diff --git a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc index df1476e9db3454..f9804070226ecf 100644 --- a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc @@ -324,8 +324,8 @@ int TrtQkMultiHeadMatmulFusePass::BuildQkFusion(Graph* graph, // combined_w_dims = [in,2,out] auto combined_w_qk_dims = - phi::make_ddim({wq_tensor->dims()[0], 2, wq_tensor->dims()[1]}); - auto combined_bias_dims = phi::make_ddim({2, bq_tensor->dims()[0]}); + common::make_ddim({wq_tensor->dims()[0], 2, wq_tensor->dims()[1]}); + auto combined_bias_dims = common::make_ddim({2, bq_tensor->dims()[0]}); VLOG(3) << "trt qk attention trt wq_dim in:" << wq_tensor->dims()[0] << "trt qk attention trt wk_dim out:" << wq_tensor->dims()[1]; diff --git a/paddle/fluid/framework/ir/trt_remove_amp_strategy_op_pass.cc b/paddle/fluid/framework/ir/trt_remove_amp_strategy_op_pass.cc index 2b3a702dcd5024..22115ea28d9f69 100644 --- a/paddle/fluid/framework/ir/trt_remove_amp_strategy_op_pass.cc +++ b/paddle/fluid/framework/ir/trt_remove_amp_strategy_op_pass.cc @@ -18,10 +18,10 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc index 5a086acd7cac2e..d9907555a17b57 100644 --- a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc +++ b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc @@ -18,13 +18,13 @@ #include #include +#include "paddle/common/errors.h" +#include "paddle/common/layout.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/errors.h" namespace paddle { 
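// The layout header that moved here, paddle/common/layout.h, also carries
// DataLayoutToString(), which this patch re-qualifies as common:: in the
// diagnostics elsewhere. A hedged sketch; assumes the DataLayout enum now
// lives in common:: alongside the header and the function returns std::string.
#include <string>
#include "paddle/common/layout.h"

std::string DescribeLayout(common::DataLayout layout) {
  return "tensor layout: " + common::DataLayoutToString(layout);
}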
namespace framework { diff --git a/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc b/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc index 23db75266310c6..382e1c60ee9895 100644 --- a/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc +++ b/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc @@ -103,12 +103,12 @@ void VitAttentionFusePass::ApplyImpl(ir::Graph* graph) const { auto* w_tensor = scope->FindVar(matmul0_in_y->Name())->GetMutable(); auto w_dims = - phi::make_ddim({w_tensor->dims()[0], 3, w_tensor->dims()[1] / 3}); + common::make_ddim({w_tensor->dims()[0], 3, w_tensor->dims()[1] / 3}); w_tensor->Resize(w_dims); auto* b_tensor = scope->FindVar(elementwise0_in_y->Name()) ->GetMutable(); - auto bias_dims = phi::make_ddim({3, b_tensor->dims()[0] / 3}); + auto bias_dims = common::make_ddim({3, b_tensor->dims()[0] / 3}); b_tensor->Resize(bias_dims); desc.SetInput("W", {matmul0_in_y->Name()}); diff --git a/paddle/fluid/framework/ir/xpu/conv1d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv1d_xpu_fuse_pass.cc index 6667cff8db8b14..c845ea657f48b1 100644 --- a/paddle/fluid/framework/ir/xpu/conv1d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv1d_xpu_fuse_pass.cc @@ -473,7 +473,7 @@ int Conv1dXPUFusePass::ApplyImpl(ir::Graph* graph, scope->GetVar(conv_filter->Name())->GetMutable(); auto filter_dims = filter_t->dims(); auto original_f_dims = - phi::make_ddim({filter_dims[0], filter_dims[1], filter_dims[3]}); + common::make_ddim({filter_dims[0], filter_dims[1], filter_dims[3]}); filter_t->Resize(original_f_dims); filter_dims = original_f_dims; // conv_filter fp16 --> fp32 diff --git a/paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.cc index 7f53507a85c83e..4ce1b239b9f891 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.cc @@ -224,7 +224,7 @@ void Conv2dBiasFusePass::FoldConv2dBias(ir::Graph* graph) const { ew_bias_add_y_desc->SetShape({y_shape[1]}); auto* ew_bias_add_y_tensor = scope->GetVar(ew_bias_add_y->Name())->GetMutable(); - ew_bias_add_y_tensor->Resize(phi::make_ddim({y_shape[1]})); + ew_bias_add_y_tensor->Resize(common::make_ddim({y_shape[1]})); ew_bias_add_desc->Flush(); found_subgraph_count++; diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index bb7fb9f7a7535d..f353a3fc952b39 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -501,7 +501,8 @@ void Conv2dXPUFusePass::CreateTheReplicatedWeights( VarDesc replicated_filter_desc(replicated_filter_name); replicated_filter_desc.SetPersistable(true); - replicated_filter_desc.SetShape(vectorize(replicated_filter_tensor.dims())); + replicated_filter_desc.SetShape( + common::vectorize(replicated_filter_tensor.dims())); replicated_filter_desc.SetDataType( framework::TransToProtoVarType(replicated_filter_tensor.dtype())); graph->CreateVarNode(&replicated_filter_desc); diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index d2d9e57084ef46..00788289402b51 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -311,7 +311,8 @@ void FcXPUFusePass::CreateTheReplicatedWeights( VarDesc replicated_filter_desc(replicated_w_name); replicated_filter_desc.SetPersistable(true); - 
replicated_filter_desc.SetShape(vectorize(replicated_filter_tensor.dims())); + replicated_filter_desc.SetShape( + common::vectorize(replicated_filter_tensor.dims())); replicated_filter_desc.SetDataType( framework::TransToProtoVarType(replicated_filter_tensor.dtype())); graph->CreateVarNode(&replicated_filter_desc); diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc index ce3bd1754edb4a..5ee61cf88c73da 100644 --- a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc @@ -566,7 +566,7 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant( // Update dst var_desc in block VarDesc dst_desc(max_buffer_name); dst_desc.SetPersistable(true); - dst_desc.SetShape(vectorize(max_buffer_tensor.dims())); + dst_desc.SetShape(common::vectorize(max_buffer_tensor.dims())); dst_desc.SetDataType( framework::TransToProtoVarType(max_buffer_tensor.dtype())); max_buffer_node = graph->CreateVarNode(&dst_desc); diff --git a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc index 905e15ecba265c..0ee61208a1cc3f 100644 --- a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc @@ -574,7 +574,7 @@ void MultiEncoderXPUFusePass::PrepareQKVWeight(Graph* graph, // Update qkv_w_int16 var_desc in block VarDesc qkv_w_int16_desc(qkv_w_int16_name); qkv_w_int16_desc.SetPersistable(true); - qkv_w_int16_desc.SetShape(vectorize(qkv_w_int16_t.dims())); + qkv_w_int16_desc.SetShape(common::vectorize(qkv_w_int16_t.dims())); qkv_w_int16_desc.SetDataType( framework::TransToProtoVarType(qkv_w_int16_t.dtype())); *qkv_w_int16 = graph->CreateVarNode(&qkv_w_int16_desc); @@ -586,7 +586,7 @@ void MultiEncoderXPUFusePass::PrepareQKVWeight(Graph* graph, // Update qkv_w_max var_desc in block VarDesc qkv_w_max_desc(qkv_w_max_name); qkv_w_max_desc.SetPersistable(true); - qkv_w_max_desc.SetShape(vectorize(qkv_w_max_t.dims())); + qkv_w_max_desc.SetShape(common::vectorize(qkv_w_max_t.dims())); qkv_w_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); *qkv_w_max = graph->CreateVarNode(&qkv_w_max_desc); auto* block_qkv_w_max_desc = block->Var(qkv_w_max_name); @@ -671,7 +671,7 @@ void MultiEncoderXPUFusePass::PrepareQKVBias(Graph* graph, // Update qkv_bias var_desc in block VarDesc qkv_bias_desc(qkv_bias_name); qkv_bias_desc.SetPersistable(true); - qkv_bias_desc.SetShape(vectorize(qkv_bias_tensor.dims())); + qkv_bias_desc.SetShape(common::vectorize(qkv_bias_tensor.dims())); qkv_bias_desc.SetDataType( framework::TransToProtoVarType(qkv_bias_tensor.dtype())); *qkv_bias = graph->CreateVarNode(&qkv_bias_desc); diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.cc b/paddle/fluid/framework/ir/xpu/pass_utils.cc index 49b7fe7c7ba56d..dbc899c93120ac 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.cc +++ b/paddle/fluid/framework/ir/xpu/pass_utils.cc @@ -105,7 +105,7 @@ size_t HashTensor(const phi::DenseTensor& in) { auto in_dims = in.dims(); HashCombine(&ret, phi::DataTypeToString(in.dtype()), - phi::DataLayoutToString(in.layout()), + common::DataLayoutToString(in.layout()), in_dims.size()); for (int i = 0; i < in_dims.size(); i++) { HashCombine(&ret, in_dims[i]); @@ -166,7 +166,7 @@ void PrepareWeight(Graph* graph, // Update dst_weight var_desc in block VarDesc dst_weight_desc(dst_weight_name); 
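// A minimal sketch of the pattern the surrounding XPU-pass hunks apply,
// assuming the post-move helpers in paddle/common/ddim.h; the tensor and
// desc names here are illustrative, not part of this patch:
//   phi::DenseTensor w;
//   w.Resize(common::make_ddim({128, 3, 64}));
//   VarDesc desc("w");
//   desc.SetShape(common::vectorize(w.dims()));  // VarDesc stores the shape as std::vector<int64_t>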
dst_weight_desc.SetPersistable(true); - dst_weight_desc.SetShape(vectorize(dst_weight_tensor.dims())); + dst_weight_desc.SetShape(common::vectorize(dst_weight_tensor.dims())); dst_weight_desc.SetDataType( framework::TransToProtoVarType(dst_weight_tensor.dtype())); *dst_weight = graph->CreateVarNode(&dst_weight_desc); @@ -178,7 +178,8 @@ void PrepareWeight(Graph* graph, // Update dst_weight_max var_desc in block VarDesc dst_weight_max_desc(dst_weight_max_name); dst_weight_max_desc.SetPersistable(true); - dst_weight_max_desc.SetShape(vectorize(dst_weight_max_tensor.dims())); + dst_weight_max_desc.SetShape( + common::vectorize(dst_weight_max_tensor.dims())); dst_weight_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); *dst_weight_max = graph->CreateVarNode(&dst_weight_max_desc); auto* block_dst_weight_max_desc = block->Var(dst_weight_max_name); @@ -226,7 +227,8 @@ void PrepareWeight(Graph* graph, // Update dst_scale_max var_desc in block VarDesc dst_scale_max_desc(dst_scale_max_name); dst_scale_max_desc.SetPersistable(true); - dst_scale_max_desc.SetShape(vectorize(dst_weight_max_tensor.dims())); + dst_scale_max_desc.SetShape( + common::vectorize(dst_weight_max_tensor.dims())); dst_scale_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); *dst_scale_max = graph->CreateVarNode(&dst_scale_max_desc); auto* block_dst_scale_max_desc = block->Var(dst_scale_max_name); @@ -309,7 +311,7 @@ void PrepareBias( // Update dst var_desc in block VarDesc dst_desc(dst_name); dst_desc.SetPersistable(true); - dst_desc.SetShape(vectorize(dst_tensor.dims())); + dst_desc.SetShape(common::vectorize(dst_tensor.dims())); dst_desc.SetDataType(framework::TransToProtoVarType(dst_tensor.dtype())); *dst = graph->CreateVarNode(&dst_desc); auto* block_dst_desc = block->Var(dst_name); diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 96cff2521dfe7c..102719079eaae2 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -463,8 +463,8 @@ void MergeLoDTensor(phi::DenseTensor *target, platform::errors::InvalidArgument( "phi::DenseTensor layout does not match, expected layout is %s, " "actual layout is %s.", - phi::DataLayoutToString(new_layout), - phi::DataLayoutToString(t->layout()))); + common::DataLayoutToString(new_layout), + common::DataLayoutToString(t->layout()))); auto tensor_dims = t->dims(); PADDLE_ENFORCE_EQ(tensor_dims.size(), new_dim.size(), diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 68aa8fceee96d4..524c9472b1c0cc 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -21,10 +21,10 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/mixed_vector.h" diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index 6d19aa474c7f75..0ae4e35fdf13e6 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -28,4 +28,4 @@ cc_library( cc_library( staticgraph_executor_statistics SRCS executor_statistics.cc - DEPS enforce glog phi) + DEPS enforce glog phi common) diff --git a/paddle/fluid/framework/new_executor/feed_fetch_utils.cc b/paddle/fluid/framework/new_executor/feed_fetch_utils.cc index dee86a8463d0fa..f5b4b6cceae65c 100644 --- a/paddle/fluid/framework/new_executor/feed_fetch_utils.cc +++ b/paddle/fluid/framework/new_executor/feed_fetch_utils.cc @@ -193,8 +193,8 @@ void MergeTensors(const std::vector& tensors, phi::errors::InvalidArgument( "phi::DenseTensor layout does not match, expected layout is %s, " "actual layout is %s.", - phi::DataLayoutToString(new_layout), - phi::DataLayoutToString(t->layout()))); + common::DataLayoutToString(new_layout), + common::DataLayoutToString(t->layout()))); if (rank > 0) { auto tensor_dims = t->dims(); PADDLE_ENFORCE_EQ(tensor_dims.size(), diff --git a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt index 1d0d3edf7081b1..abc8e86fb1663f 100644 --- a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt @@ -9,11 +9,11 @@ cc_library( tuple_push_instruction.cc tuple_pop_instruction.cc instruction_util.cc - DEPS pir_adaptor phi framework_proto) + DEPS pir_adaptor phi common framework_proto) if(WITH_CINN AND NOT CINN_ONLY) cc_library( cinn_jit_instruction NOT_FOR_INFER SRCS cinn_jit_instruction.cc - DEPS phi cinnapi cinn_op_dialect cinn_runtime_dialect) + DEPS phi common cinnapi cinn_op_dialect cinn_runtime_dialect) endif() diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc index f9124c57874f31..9ff10d0ae7c91c 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc @@ -18,9 +18,9 @@ #include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" #include "paddle/cinn/hlir/framework/instruction.h" #include "paddle/cinn/hlir/framework/pir_compiler.h" +#include "paddle/common/errors.h" #include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" #include "paddle/fluid/framework/paddle2cinn/transform_type.h" -#include "paddle/phi/core/errors.h" #if defined(PADDLE_WITH_CUDA) #include "paddle/cinn/runtime/cinn_runtime.h" #endif diff --git a/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt b/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt index a0c3a06a02b52d..e2d221d167b139 100644 --- a/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt @@ -31,6 +31,7 @@ set(INTERPRETER_DEPS scope glog phi + common ${DEVICE_EVENT_LIBS} glog) diff --git a/paddle/fluid/framework/new_executor/interpreter/job.h 
b/paddle/fluid/framework/new_executor/interpreter/job.h index 952702d6e2f0a5..21acaa54aed0b5 100644 --- a/paddle/fluid/framework/new_executor/interpreter/job.h +++ b/paddle/fluid/framework/new_executor/interpreter/job.h @@ -16,9 +16,9 @@ #include #include +#include "paddle/common/errors.h" +#include "paddle/common/macros.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" -#include "paddle/phi/core/macros.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/interpreter/plan.h b/paddle/fluid/framework/new_executor/interpreter/plan.h index beb2c176f94ad8..20dbbfad0e1b20 100644 --- a/paddle/fluid/framework/new_executor/interpreter/plan.h +++ b/paddle/fluid/framework/new_executor/interpreter/plan.h @@ -20,8 +20,8 @@ #include "paddle/fluid/framework/new_executor/interpreter/job.h" +#include "paddle/common/macros.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/phi/core/macros.h" #include "paddle/pir/core/program.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt index b0ab1826fb4bfb..bbac454694e2f4 100644 --- a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt @@ -1,11 +1,11 @@ cc_library( workqueue_utils SRCS workqueue_utils.cc events_waiter.cc - DEPS enforce glog) + DEPS enforce glog common) cc_library( workqueue SRCS workqueue.cc - DEPS workqueue_utils enforce glog phi) + DEPS workqueue_utils enforce glog phi common) cc_test( workqueue_test SRCS workqueue_test.cc diff --git a/paddle/fluid/framework/new_executor/workqueue/event_count.h b/paddle/fluid/framework/new_executor/workqueue/event_count.h index c9fd47b92383f8..9f80b02904dad6 100644 --- a/paddle/fluid/framework/new_executor/workqueue/event_count.h +++ b/paddle/fluid/framework/new_executor/workqueue/event_count.h @@ -56,7 +56,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.h b/paddle/fluid/framework/no_need_buffer_vars_inference.h index 37f790a0d3f415..767abb1f07e6c2 100644 --- a/paddle/fluid/framework/no_need_buffer_vars_inference.h +++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h @@ -20,10 +20,10 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/core/macros.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 1846b7c9f0f71b..357cd400ba5b69 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -367,7 +367,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { DDim res; try { auto shape = var->GetShape(); - res = phi::make_ddim(shape); + res = common::make_ddim(shape); } catch (...) { VLOG(5) << "GetDim of variable " << name << " error"; std::rethrow_exception(std::current_exception()); @@ -1319,7 +1319,7 @@ std::vector CompileTimeInferShapeContext::GetRepeatedDims( try { auto shapes = var->GetShapes(); for (const auto &s : shapes) { - res.push_back(phi::make_ddim(s)); + res.push_back(common::make_ddim(s)); } } catch (...) 
{ VLOG(5) << "GetRepeatedDim of variable " << name << " error."; @@ -1330,7 +1330,7 @@ std::vector CompileTimeInferShapeContext::GetRepeatedDims( void CompileTimeInferShapeContext::SetDim(const std::string &name, const DDim &dim) { - block_.FindVarRecursive(name)->SetShape(vectorize(dim)); + block_.FindVarRecursive(name)->SetShape(common::vectorize(dim)); } void CompileTimeInferShapeContext::SetRepeatedDims( @@ -1339,7 +1339,8 @@ void CompileTimeInferShapeContext::SetRepeatedDims( PADDLE_ENFORCE_NOT_NULL( var, platform::errors::NotFound("Variable %s is not found.", name)); std::vector> dim_vec(dims.size()); - std::transform(dims.begin(), dims.end(), dim_vec.begin(), phi::vectorize<>); + std::transform( + dims.begin(), dims.end(), dim_vec.begin(), common::vectorize<>); var->SetShapes(dim_vec); } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 852a09ed2fdc87..fa8f6fe5892134 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -21,11 +21,11 @@ limitations under the License. */ #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/distributed/auto_parallel/dist_attr.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/var_desc.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/test_macros.h" namespace paddle { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6e33f74f432a62..84ee045918fcd7 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -26,6 +26,7 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" // For VLOG() +#include "paddle/common/macros.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/grad_op_desc_maker.h" @@ -35,7 +36,6 @@ limitations under the License. */ #include "paddle/fluid/framework/shape_inference.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/test_macros.h" namespace paddle { @@ -178,7 +178,7 @@ inline void RegisterKernelClass(const char* op_type, if (std::is_same::value) { OpKernelType key(ToDataType(std::type_index(typeid(T))), platform::CustomPlace(library_type), - phi::StringToDataLayout(data_layout), + common::StringToDataLayout(data_layout), LibraryType::kPlain, customized_type_value); OperatorWithKernel::AllOpKernels()[op_type][key] = func; @@ -187,7 +187,7 @@ inline void RegisterKernelClass(const char* op_type, #endif OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), - phi::StringToDataLayout(data_layout), + common::StringToDataLayout(data_layout), StringToLibraryType(library_type), customized_type_value); OperatorWithKernel::AllOpKernels()[op_type][key] = func; diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index 278abe825da979..236a0e2b86187e 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -20,10 +20,10 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/none.h" namespace paddle { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 3484c5cc05940e..4ae5e0ebdf8720 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/data_type_transform.h" @@ -37,7 +38,6 @@ limitations under the License. */ #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/compat/get_kerneltype_forvar_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0bcaaef0b48b28..d51c0ce0f415d0 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -40,12 +40,12 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/common/macros.h" #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/test_macros.h" diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index 04241179e3c0de..4a0a869b8a2bd8 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -15,7 +15,8 @@ set(paddle2cinn_deps cinn_framework_proto schedule_desc_proto auto_schedule_proto - parallel_executor) + parallel_executor + common) if(WITH_MKLDNN) set(paddle2cinn ${paddle2cinn} mkldnn) endif() diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc index 6fdbbaae9d70c0..00a6c94e4c0ed1 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -21,10 +21,10 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/paddle2cinn/transform_type.h" -#include "paddle/phi/core/ddim.h" #include "paddle/utils/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h index d1797ddf6bbd48..601b18464c2c67 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -17,10 +17,10 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h 
index 103b9a3ca0d4bd..f926829dc9bd47 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -19,9 +19,9 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 72cfa4da2f2456..49603b34255db9 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -17,10 +17,10 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/type_defs.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index e92734a1e35dd4..18e5e87437ae2f 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -466,7 +466,7 @@ void TensorToStream(std::ostream& os, // void* protobuf message proto::VarType::TensorDesc desc; desc.set_data_type(framework::TransToProtoVarType(tensor.dtype())); - auto dims = phi::vectorize(tensor.dims()); + auto dims = common::vectorize(tensor.dims()); auto* pb_dims = desc.mutable_dims(); pb_dims->Resize(static_cast(dims.size()), 0); std::copy(dims.begin(), dims.end(), pb_dims->begin()); @@ -608,7 +608,7 @@ void TensorFromStream(std::istream& is, platform::errors::InvalidArgument("Cannot parse tensor desc")); } { // read tensor - tensor->Resize(phi::make_ddim(shape)); + tensor->Resize(common::make_ddim(shape)); size_t seekg = seek * framework::SizeOfType(desc.data_type()); is.seekg(seekg, is.cur); // NOLINT @@ -621,7 +621,7 @@ void TensorFromStream(std::istream& is, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; - cpu_tensor.Resize(phi::make_ddim(shape)); + cpu_tensor.Resize(common::make_ddim(shape)); framework::VisitDataType( desc.data_type(), DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); @@ -684,7 +684,7 @@ void TensorFromStream(std::istream& is, std::vector dims; dims.reserve(static_cast(desc.dims().size())); std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); - tensor->Resize(phi::make_ddim(dims)); + tensor->Resize(common::make_ddim(dims)); void* buf = nullptr; phi::CPUContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); @@ -694,7 +694,7 @@ void TensorFromStream(std::istream& is, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; - cpu_tensor.Resize(phi::make_ddim(dims)); + cpu_tensor.Resize(common::make_ddim(dims)); framework::VisitDataType( desc.data_type(), DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); @@ -802,14 +802,14 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) { dl_tensor.shape + dl_tensor.ndim, std::back_inserter(vec)); - framework::DDim vddim = phi::make_ddim(vec); + framework::DDim vddim = common::make_ddim(vec); dst->Resize(vddim); ::DLDataType type = dl_tensor.dtype; void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); auto src_ptr = static_cast(dl_tensor.data); - 
auto size = phi::product(vddim) * type.bits / 8; + auto size = common::product(vddim) * type.bits / 8; if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -841,12 +841,12 @@ void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) { src->dl_tensor.shape + src->dl_tensor.ndim, std::back_inserter(vec)); - framework::DDim vddim = phi::make_ddim(vec); + framework::DDim vddim = common::make_ddim(vec); dst->Resize(vddim); ::DLDataType type = src->dl_tensor.dtype; auto src_ptr = static_cast(src->dl_tensor.data); - auto size = phi::product(vddim) * type.bits / 8; + auto size = common::product(vddim) * type.bits / 8; if (src->dl_tensor.device.device_type == kDLCPU) { platform::CPUPlace dst_place = platform::CPUPlace(); @@ -973,7 +973,7 @@ TEST_API std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) { os << " - lod: " << t.lod() << "\n"; } os << " - shape: [" << t.dims() << "]\n"; - os << " - layout: " << phi::DataLayoutToString(t.layout()) << "\n"; + os << " - layout: " << common::DataLayoutToString(t.layout()) << "\n"; if (!t.initialized()) { os << "uninited\n"; diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index e6e3c01be624a4..b6d846e9a0c12d 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,11 +1,11 @@ cc_library( imperative_flag SRCS flags.cc - DEPS phi) + DEPS phi common) cc_library( var_helper SRCS var_helper.cc - DEPS tensor phi) + DEPS tensor phi common) if(WITH_XPU) cc_library( prepared_operator @@ -21,6 +21,7 @@ if(WITH_XPU) data_transform nan_inf_utils phi + common var_helper profiler place) @@ -38,6 +39,7 @@ else() data_transform nan_inf_utils phi + common var_helper profiler place) @@ -45,14 +47,19 @@ endif() cc_library( layer SRCS layer.cc - DEPS prepared_operator phi imperative_flag variable_helper op_registry + DEPS prepared_operator + phi + common + imperative_flag + variable_helper + op_registry var_helper) add_subdirectory(jit) if(WITH_GPU) cc_library( layout_autotune SRCS layout_autotune.cc - DEPS op_info phi) + DEPS op_info phi common) else() cc_library( layout_autotune @@ -75,19 +82,20 @@ cc_library( var_helper layout_autotune ops_extra_info - phi) + phi + common) cc_library( basic_engine SRCS basic_engine.cc - DEPS layer gradient_accumulator phi) + DEPS layer gradient_accumulator phi common) cc_library( engine SRCS basic_engine.cc partial_grad_engine.cc - DEPS layer gradient_accumulator phi) + DEPS layer gradient_accumulator phi common) cc_library( imperative_profiler SRCS profiler.cc - DEPS phi) + DEPS phi common) if(NOT WIN32) if(WITH_NCCL OR WITH_RCCL) cc_library( @@ -150,7 +158,7 @@ if(NOT WIN32) cc_library( data_loader SRCS data_loader.cc - DEPS enforce) + DEPS enforce common) endif() if(WITH_GLOO) cc_library( @@ -173,4 +181,10 @@ endif() cc_library( gradient_accumulator SRCS gradient_accumulator.cc - DEPS operator lod_tensor selected_rows_utils var_type_traits layer phi) + DEPS operator + lod_tensor + selected_rows_utils + var_type_traits + layer + phi + common) diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index d5f03924e28c17..c4bb42e4c22bb4 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -146,7 +146,7 @@ static void AllReduce(const phi::SelectedRows &src, auto *dst_tensor = dst->mutable_value(); auto dims = src_tensor.dims(); dims[0] = rows_num; - auto feature_size = phi::product(dims) 
/ dims[0]; + auto feature_size = common::product(dims) / dims[0]; dst_tensor->Resize(dims); auto *dst_tensor_ptr = dst_tensor->mutable_data(place, src_tensor.dtype()); const auto *src_tensor_ptr = src_tensor.data(); diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 16de1baf72da96..4e0df45e840f25 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -165,7 +165,7 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src, auto *dst_tensor = dst->mutable_value(); auto dims = src_tensor.dims(); dims[0] = rows_num; - auto feature_size = phi::product(dims) / dims[0]; + auto feature_size = common::product(dims) / dims[0]; dst_tensor->Resize(dims); std::vector element_nums = rows_num_vector; diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index 80c52ab4ac10f8..e4aeb477db8ad1 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -17,13 +17,13 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/var_helper.h" #include "paddle/fluid/imperative/variable_wrapper.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_factory.h" namespace paddle { diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index deda1ff572a704..86a38f3942aaa7 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -264,7 +264,7 @@ void ProgramDescTracer::InsertVarIfNotExist( if (inner_var.IsType()) { const auto &tensor = inner_var.Get(); new_var_desc->SetType(framework::proto::VarType::LOD_TENSOR); - new_var_desc->SetShape(phi::vectorize(tensor.dims())); + new_var_desc->SetShape(common::vectorize(tensor.dims())); new_var_desc->SetLoDLevel(static_cast(tensor.lod().size())); if (tensor.IsInitialized()) { new_var_desc->SetDataType(framework::TransToProtoVarType(tensor.dtype())); diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc index 18baaf98fdf11c..4075be6491ff19 100644 --- a/paddle/fluid/imperative/layout_autotune.cc +++ b/paddle/fluid/imperative/layout_autotune.cc @@ -14,12 +14,12 @@ #include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/common/errors.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/layout_transformer.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace imperative { @@ -207,7 +207,7 @@ paddle::imperative::NameVarMap AutoTuneLayout( VLOG(3) << "Tune the layout from " << PADDLE_GET_CONST(std::string, (*attrs)["data_format"]) << " to " - << phi::DataLayoutToString( + << common::DataLayoutToString( LayoutAutoTune::Instance().GetDesiredLayout()); } } diff --git a/paddle/fluid/imperative/layout_autotune.h b/paddle/fluid/imperative/layout_autotune.h index 4b08a34f943f43..bee6529493fecd 100644 --- a/paddle/fluid/imperative/layout_autotune.h +++ b/paddle/fluid/imperative/layout_autotune.h @@ -18,9 +18,9 @@ #include #include +#include "paddle/common/layout.h" #include 
"paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/tracer.h" -#include "paddle/phi/common/layout.h" namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index 61bd4f9dfe2b8f..5827e824dad90e 100644 --- a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -13,12 +13,12 @@ // limitations under the License. #pragma once +#include "paddle/common/errors.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/var_helper.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/tensor_utils.h" namespace paddle { namespace imperative { @@ -62,10 +62,10 @@ std::shared_ptr TraceTransposeOp( tracer->TraceOp("transpose2", ins, outs, std::move(attrs)); paddle::imperative::SetDataLayout(out, layout); VLOG(4) << "Transpose " << paddle::imperative::GetNameFromVar(var) << "[" - << phi::DataLayoutToString(paddle::imperative::GetDataLayout(var)) + << common::DataLayoutToString(paddle::imperative::GetDataLayout(var)) << "]" << " to " << paddle::imperative::GetNameFromVar(out) << "[" - << phi::DataLayoutToString(paddle::imperative::GetDataLayout(out)) + << common::DataLayoutToString(paddle::imperative::GetDataLayout(out)) << "]"; return out; } @@ -102,7 +102,7 @@ class LayoutTransformer { } } VLOG(3) << "Optimze Layout agnostic op: " << type_ << " " - << phi::DataLayoutToString(in_layout); + << common::DataLayoutToString(in_layout); if (in_layout != DataLayout::UNDEFINED) { SetVarsLayout(outs, in_layout); } @@ -184,8 +184,8 @@ class HeavilyLayoutSensitiveOpTransformer : public LayoutTransformer { // Step 1: Adjust the data_layout attr to the desired layout auto desired_layout = LayoutAutoTune::Instance().GetDesiredLayout(); - std::string desired_layout_str = - phi::DataLayoutToString(LayoutAutoTune::Instance().GetDesiredLayout()); + std::string desired_layout_str = common::DataLayoutToString( + LayoutAutoTune::Instance().GetDesiredLayout()); if (attrs->find("data_format") != attrs->end() && PADDLE_GET_CONST(std::string, (*attrs)["data_format"]) != desired_layout_str) { @@ -251,10 +251,10 @@ class LightlyLayoutSensitiveOpTransformer : public LayoutTransformer { for (auto& var : pair.second) { if (var != nullptr) { VLOG(3) << "Tune the layout from " - << phi::DataLayoutToString( + << common::DataLayoutToString( paddle::imperative::GetDataLayout(var)) << " to " - << phi::DataLayoutToString( + << common::DataLayoutToString( LayoutAutoTune::Instance().GetDesiredLayout()); } if (var != nullptr && diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index b03aadd4dc6aa2..4bbc52662fc96e 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -841,7 +841,7 @@ void Reducer::MarkGroupReady(size_t group_index) { UNUSED const int run_order = next_group_ % nrings_; auto *tensor = group.dense_contents_.GetMutable(); - tensor->Resize(phi::make_ddim({group.all_length_})) + tensor->Resize(common::make_ddim({group.all_length_})) .mutable_data(place_, framework::TransToPhiDataType(group.dtype_)); // For CUDA or XPU, compute_stream --> comm_stream. 
diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index d4438e8b47b970..d18750a26f0337 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -19,13 +19,13 @@ #include #include +#include "paddle/common/layout.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/op_base.h" -#include "paddle/phi/common/layout.h" namespace paddle { namespace imperative { diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 81b0abe570e77d..f8bd690c885ea3 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -69,8 +69,14 @@ if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} ${STATIC_INFERENCE_API} ${utils_modules}) else() - create_static_lib(paddle_inference ${phi_modules} ${fluid_modules} - ${ir_targets} ${STATIC_INFERENCE_API} ${utils_modules}) + create_static_lib( + paddle_inference + ${phi_modules} + ${fluid_modules} + ${ir_targets} + ${STATIC_INFERENCE_API} + ${utils_modules} + common_static) endif() if(NOT APPLE) @@ -103,7 +109,7 @@ list(REMOVE_ITEM fluid_modules cinn_op_dialect) # shared library to prune library size. list(REMOVE_ITEM fluid_modules ${not_infer_modules}) -set(SHARED_INFERENCE_DEPS phi ${fluid_modules} analysis_predictor +set(SHARED_INFERENCE_DEPS phi common ${fluid_modules} analysis_predictor ${utils_modules}) if(NOT WIN32) list(APPEND SHARED_INFERENCE_DEPS ${ir_targets}) @@ -134,7 +140,7 @@ target_link_libraries(paddle_inference_shared ${os_dependency_modules}) if(WIN32) set_property(TARGET paddle_inference_shared PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON) - target_link_libraries(paddle_inference_shared phi) + target_link_libraries(paddle_inference_shared phi common) endif() set_target_properties(paddle_inference_shared PROPERTIES LINK_FLAGS "-Wl,-rpath,'$ORIGIN'") diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 1ecc067f3b90ee..122dbbda8fabdd 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -22,13 +22,13 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/string/pretty_log.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index c5f00cb08355f6..ea648d8574c94b 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -39,7 +39,7 @@ if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps framework_io) endif() if(WITH_CUSTOM_DEVICE) - set(paddle_inference_api_deps ${paddle_inference_api_deps} phi) + set(paddle_inference_api_deps ${paddle_inference_api_deps} phi common) endif() if(WIN32) @@ -61,7 +61,7 @@ cc_library( table_printer utf8proc) if(WIN32) - target_link_libraries(paddle_inference_api phi) + target_link_libraries(paddle_inference_api phi common) endif() set(inference_deps diff 
--git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 32083da55e6542..c821aad73459a8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -236,7 +236,7 @@ phi::Backend ConvertBackend(paddle_infer::PlaceType backend) { bool PaddleTensorToDenseTensor(const PaddleTensor &pt, phi::DenseTensor *t, const platform::Place &place) { - framework::DDim ddim = phi::make_ddim(pt.shape); + framework::DDim ddim = common::make_ddim(pt.shape); void *input_ptr = nullptr; if (pt.dtype == PaddleDType::INT64) { input_ptr = t->mutable_data(ddim, place); @@ -254,7 +254,7 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt, } // NOTE(Aurelius84): Some kernels support zero shape input // without memory holder, we should skip enforce logic. - bool has_zero_dim = (phi::product(ddim) == 0); + bool has_zero_dim = (common::product(ddim) == 0); VLOG(3) << "Found zero dim: " << has_zero_dim << " from input with ddim: " << ddim; if (!has_zero_dim) { @@ -1138,7 +1138,7 @@ void AnalysisPredictor::MkldnnPreSet( #ifdef PADDLE_WITH_DNNL std::vector> inputs_shape; for (const auto &input : inputs) { - inputs_shape.emplace_back(phi::vectorize(input.dims())); + inputs_shape.emplace_back(common::vectorize(input.dims())); } MkldnnPreSet(inputs_shape); #endif @@ -1416,7 +1416,7 @@ template void AnalysisPredictor::GetFetchOne(const phi::DenseTensor &fetch, PaddleTensor *output) { // set shape. - auto shape = phi::vectorize(fetch.dims()); + auto shape = common::vectorize(fetch.dims()); output->shape.assign(shape.begin(), shape.end()); // set data. int num_elems = inference::VecReduceToInt(shape); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c3f50fd6f6bb39..d886885edb5ba5 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -213,7 +213,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, for (size_t i = 0; i < inputs.size(); ++i) { auto &input = feed_tensors_[i]; - framework::DDim ddim = phi::make_ddim(inputs[i].shape); + framework::DDim ddim = common::make_ddim(inputs[i].shape); void *input_ptr = nullptr; if (inputs[i].dtype == PaddleDType::INT64) { input_ptr = input.mutable_data(ddim, place_); @@ -299,7 +299,7 @@ template void NativePaddlePredictor::GetFetchOne(const phi::DenseTensor &fetch, PaddleTensor *output) { // set shape. - auto shape = phi::vectorize(fetch.dims()); + auto shape = common::vectorize(fetch.dims()); output->shape.assign(shape.begin(), shape.end()); // set data. const T *data = fetch.data(); diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index e86dbe14d1746e..75bc0b3cafa17a 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/naive_executor.h" @@ -31,7 +32,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" -#include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 105ff16747dfd0..b7eb0030155b72 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -21,20 +21,20 @@ if(WITH_ONNXRUNTIME) cc_library( zero_copy_tensor SRCS zero_copy_tensor.cc - DEPS scope lod_tensor enforce onnxruntime) + DEPS scope lod_tensor enforce onnxruntime common) cc_library( zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc - DEPS onnxruntime phi) + DEPS onnxruntime phi common) else() cc_library( zero_copy_tensor SRCS zero_copy_tensor.cc - DEPS scope lod_tensor enforce) + DEPS scope lod_tensor enforce common) cc_library( zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc - DEPS phi) + DEPS phi common) endif() cc_test( diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index cbd74f644d40d2..eee3a707a03b14 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -59,7 +59,7 @@ void Tensor::Reshape(const std::vector &shape) { paddle::platform::errors::PreconditionNotMet( "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); - tensor->Resize(phi::make_ddim(shape)); + tensor->Resize(common::make_ddim(shape)); } void Tensor::ReshapeStrings(const size_t &shape) { @@ -337,7 +337,7 @@ void Tensor::ShareExternalData(const T *data, std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * sizeof(T); phi::DenseTensorMeta meta( - DataTypeInfo().TYPE, phi::make_ddim(shape), LayoutConvert(layout)); + DataTypeInfo().TYPE, common::make_ddim(shape), LayoutConvert(layout)); if (place == PlaceType::kCPU) { phi::DenseTensor dtensor( std::make_shared( @@ -733,18 +733,19 @@ std::vector Tensor::shape() const { // at last nhwC, so for dim==2 these layouts are the same and nothing should // be done. Similarly for dim==1 when you have just one possible // combination. 
- if (tensor->dims().size() < 3) return phi::vectorize(tensor->dims()); + if (tensor->dims().size() < 3) + return common::vectorize(tensor->dims()); if (out_layout == phi::DataLayout::kNHWC || out_layout == phi::DataLayout::kNDHWC) { - auto dims = phi::vectorize(tensor->dims()); + auto dims = common::vectorize(tensor->dims()); std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); return dims; } else { - return phi::vectorize(tensor->dims()); + return common::vectorize(tensor->dims()); } } #endif - return phi::vectorize(tensor->dims()); + return common::vectorize(tensor->dims()); } void Tensor::SetLoD(const std::vector> &x) { diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 1b604b544b9475..54a198b4e2f590 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -428,7 +428,7 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( auto dims = var_tensor.dims(); constexpr int num_col_dims = 1; - auto flattened_dims = phi::flatten_to_2d(dims, num_col_dims); + auto flattened_dims = common::flatten_to_2d(dims, num_col_dims); ConstEigenMatrixArrayMap eigen_tensor_mat{ var_tensor.data(), flattened_dims[0], flattened_dims[1]}; diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index 4f8435ca505c0e..25970440469168 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -317,7 +317,7 @@ Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, size_t size = tensor->numel() * framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); - std::vector shape = phi::vectorize(tensor->dims()); + std::vector shape = common::vectorize(tensor->dims()); return Ort::Value::CreateTensor(memory_info, static_cast(tensor->data()), size, diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index 2414aaee1b78b5..2806204f4b9406 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -20,6 +20,7 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/forwards.h" @@ -28,7 +29,6 @@ #include "paddle/phi/backends/gpu/gpu_resources.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/generator.h" #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt index 3a72f0e880c4b7..97a7910669a108 100644 --- a/paddle/fluid/inference/capi_exp/CMakeLists.txt +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -36,6 +36,7 @@ if(APPLE) cryptopp protobuf phi + common pir cblas) endif() diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 6de5f9cfa0ca17..9b36b6dc745e85 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -218,7 +218,7 @@ void TensorCopyAsync(paddle::lite_api::Tensor* dst, const platform::Place& dst_place = GetNativePlace(dst->target()); const size_t bytes = static_cast(src.numel()) * phi::SizeOf(src.dtype()); - dst->Resize(phi::vectorize(src.dims())); + 
dst->Resize(common::vectorize(src.dims())); const void* src_data = src.data(); void* dst_data{nullptr}; dst_data = GetLiteTensorDataPtr( @@ -236,7 +236,7 @@ template <> void TensorCopyAsync(phi::DenseTensor* dst, const paddle::lite_api::Tensor& src, const platform::DeviceContext& ctx) { - dst->Resize(phi::make_ddim(src.shape())); + dst->Resize(common::make_ddim(src.shape())); InitDstTensor(dst, src); const platform::Place& src_place = GetNativePlace(src.target()); const platform::Place& dst_place = dst->place(); @@ -254,7 +254,7 @@ void TensorCopyAsync(phi::DenseTensor* dst, template <> void TensorDataShare(paddle::lite_api::Tensor* dst, phi::DenseTensor* src) { - dst->Resize(phi::vectorize(src->dims())); + dst->Resize(common::vectorize(src->dims())); dst->ShareExternalMemory( src->data(), src->memory_size(), GetLiteTargetType(src->place())); dst->SetPrecision( @@ -273,7 +273,7 @@ void TensorDataShare(phi::DenseTensor* dst, paddle::lite_api::Tensor* src) { framework::SizeOfType(GetNativePrecisionType(src->precision())); std::shared_ptr holder(new phi::Allocation( src_raw_data, memory_size, GetNativePlace(src->target()))); - dst->Resize(phi::make_ddim(src->shape())); + dst->Resize(common::make_ddim(src->shape())); SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType( holder, diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 1d62d0aec013c6..b5b7bc857c1174 100755 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -150,7 +150,8 @@ if(WIN32) nv_test( test_custom_plugin_creater SRCS test_custom_plugin_creater.cc - DEPS paddle_framework tensorrt_converter phi custom_operator init_phi) + DEPS paddle_framework tensorrt_converter phi common custom_operator + init_phi) elseif(WITH_CINN) nv_test( test_custom_plugin_creater @@ -158,6 +159,7 @@ elseif(WITH_CINN) DEPS paddle_framework tensorrt_converter phi + common custom_operator init_phi fleet_executor @@ -169,6 +171,7 @@ else() DEPS paddle_framework tensorrt_converter phi + common custom_operator init_phi fleet_executor diff --git a/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc index 77153d8ade56db..4cd7378c17b443 100644 --- a/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc @@ -33,7 +33,7 @@ class BilinearInterpolateV2OpConverter : public OpConverter { auto input = engine_->GetITensor(input_name); - auto data_layout = phi::StringToDataLayout( + auto data_layout = common::StringToDataLayout( PADDLE_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); auto interp_method = PADDLE_GET_CONST(std::string, op_desc.GetAttr("interp_method")); diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index c0d93a7588b4b2..bec18da482e41a 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -49,7 +49,7 @@ class DropoutOpConverter : public OpConverter { platform::CPUPlace cpu_place; std::unique_ptr weight_tensor(new phi::DenseTensor()); - weight_tensor->Resize(phi::make_ddim({1})); + weight_tensor->Resize(common::make_ddim({1})); auto* weight_data = weight_tensor->mutable_data(platform::CPUPlace()); weight_data[0] = 1 - dropout_prob; diff --git 
a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 198a164894c0b1..f5f7e53cf4e0d4 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -33,7 +33,7 @@ class ElementwiseTensorOpConverter : public OpConverter { if (Y_v && !engine_->with_dynamic_shape()) { // Y is weight auto* Y_t = Y_v->GetMutable(); - std::vector dims_y = phi::vectorize(Y_t->dims()); + std::vector dims_y = common::vectorize(Y_t->dims()); auto y_weight = engine_->GetTrtWeight(op_desc.Input("Y").front(), *Y_t); nvinfer1::Dims trt_dims_y; diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index dba8086f2952e7..6ccb22e072f1b2 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -9,13 +9,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/common/ddim.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/utils.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_plugin.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace inference { @@ -54,8 +54,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { framework::DDim bias_dims, scale_dims; TensorRTEngine::Weight bias_weight, scale_weight; - int64_t bias_size = phi::product(bias_dims); - int64_t scale_size = phi::product(scale_dims); + int64_t bias_size = common::product(bias_dims); + int64_t scale_size = common::product(scale_dims); nvinfer1::ILayer* layer = nullptr; bool enable_int8 = op_desc.HasAttr("enable_int8"); @@ -81,8 +81,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { } bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims); scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims); - bias_size = phi::product(bias_dims); - scale_size = phi::product(scale_dims); + bias_size = common::product(bias_dims); + scale_size = common::product(scale_dims); // other_id(except pos_id) engine_->SetITensor("word_id", input_ids[1]); @@ -189,8 +189,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { } bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims); scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims); - bias_size = phi::product(bias_dims); - scale_size = phi::product(scale_dims); + bias_size = common::product(bias_dims); + scale_size = common::product(scale_dims); int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 
1 : 0); if (enable_int8) { diff --git a/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc index f6f476dc204851..4da409f9097337 100644 --- a/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc @@ -36,7 +36,7 @@ class FillConstantOpConverter : public OpConverter { str_value = std::to_string(value); } std::unique_ptr out_tensor(new phi::DenseTensor()); - out_tensor->Resize(phi::make_ddim(shape)); + out_tensor->Resize(common::make_ddim(shape)); nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; void* trt_data = nullptr; size_t trt_num; diff --git a/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc index 9f14c8c1b64fb8..e811827a7296c1 100644 --- a/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc +++ b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/common/errors.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/generic_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index d14be87e3ffd94..85a085aa221c41 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -47,7 +47,7 @@ class LeakyReluOpConverter : public OpConverter { #else platform::CPUPlace place; std::unique_ptr alpha_tensor(new phi::DenseTensor()); - alpha_tensor->Resize(phi::make_ddim({2})); + alpha_tensor->Resize(common::make_ddim({2})); float* alpha_data = alpha_tensor->mutable_data(place); alpha_data[0] = alpha; alpha_data[1] = 1.f - alpha; diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc index 84c2cc54f955d2..4c0b1a027640bc 100644 --- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc @@ -33,7 +33,7 @@ class NearestInterpolateOpConverter : public OpConverter { auto data_layout = !op_desc.HasAttr("data_layout") ? 
phi::DataLayout::kNCHW - : phi::StringToDataLayout(PADDLE_GET_CONST( + : common::StringToDataLayout(PADDLE_GET_CONST( std::string, op_desc.GetAttr("data_layout"))); auto interp_method = PADDLE_GET_CONST(std::string, op_desc.GetAttr("interp_method")); diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc index 997a467077043b..6f33a710469776 100644 --- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc @@ -31,7 +31,7 @@ class NearestInterpolateV2OpConverter : public OpConverter { auto input = engine_->GetITensor(input_name); auto inputs = op_desc.Inputs(); - auto data_layout = phi::StringToDataLayout( + auto data_layout = common::StringToDataLayout( PADDLE_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); auto interp_method = PADDLE_GET_CONST(std::string, op_desc.GetAttr("interp_method")); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc index afeacd5cd6b981..529175c7de81a8 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -63,8 +63,8 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { framework::DDim bias_dims, scale_dims; TensorRTEngine::Weight bias_weight, scale_weight; - int64_t bias_size = phi::product(bias_dims); - int64_t scale_size = phi::product(scale_dims); + int64_t bias_size = common::product(bias_dims); + int64_t scale_size = common::product(scale_dims); std::vector id_names = op_desc.Input("Ids"); std::vector emb_names = op_desc.Input("Embs"); @@ -135,8 +135,8 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { } bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims); scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims); - bias_size = phi::product(bias_dims); - scale_size = phi::product(scale_dims); + bias_size = common::product(bias_dims); + scale_size = common::product(scale_dims); // other_id(except pos_id) engine_->SetITensor("word_id", input_ids[1]); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc index d8ffe4da595bdd..9091cfd10e3e8f 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc @@ -53,10 +53,10 @@ class PrelnResidualBiasOpConverter : public OpConverter { float* ele_bias = has_bias ? get_persistable_data("Bias", &ele_bias_dims) : nullptr; - int bias_size = phi::product(bias_dims); + int bias_size = common::product(bias_dims); - int scale_size = phi::product(scale_dims); - int ele_bias_size = has_bias ? phi::product(ele_bias_dims) : 0; + int scale_size = common::product(scale_dims); + int ele_bias_size = has_bias ? 
common::product(ele_bias_dims) : 0; float epsilon = PADDLE_GET_CONST(float, op_desc.GetAttr("ln_epsilon")); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); if (engine_->precision() == phi::DataType::INT8) { diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc index f9d4be7c55f1a2..d21247e877cec4 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc @@ -57,8 +57,8 @@ class PrelnSkipLayerNormOpConverter : public OpConverter { framework::DDim bias_dims, scale_dims; auto* bias = get_persistable_data("Bias", &bias_dims); auto* scale = get_persistable_data("Scale", &scale_dims); - int bias_size = phi::product(bias_dims); - int scale_size = phi::product(scale_dims); + int bias_size = common::product(bias_dims); + int scale_size = common::product(scale_dims); nvinfer1::ILayer* layer = nullptr; diff --git a/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc index f5fc773135c565..9fc4c96ab7b93f 100644 --- a/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc @@ -9,12 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/common/ddim.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/utils.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_plugin.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace inference { @@ -55,8 +55,8 @@ class PromptTuningEmbEltwiseLayerNormOpConverter : public OpConverter { framework::DDim bias_dims, scale_dims; TensorRTEngine::Weight bias_weight, scale_weight; - int64_t bias_size = phi::product(bias_dims); - int64_t scale_size = phi::product(scale_dims); + int64_t bias_size = common::product(bias_dims); + int64_t scale_size = common::product(scale_dims); bool enable_int8 = op_desc.HasAttr("enable_int8"); std::vector id_names = op_desc.Input("Ids"); @@ -80,8 +80,8 @@ class PromptTuningEmbEltwiseLayerNormOpConverter : public OpConverter { } bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims); scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims); - bias_size = phi::product(bias_dims); - scale_size = phi::product(scale_dims); + bias_size = common::product(bias_dims); + scale_size = common::product(scale_dims); // other_id(except pos_id) engine_->SetITensor("word_id", input_ids[1]); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index b653df0bca83bc..7ef6d1f3241d8b 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -59,7 +59,7 @@ TEST(OpConverter, ConvertBlock) { std::vector dim_vec = {3, 2, 3, 3}; auto* x = scope.Var("conv2d-Y"); auto* x_tensor = x->GetMutable(); - x_tensor->Resize(phi::make_ddim(dim_vec)); + x_tensor->Resize(common::make_ddim(dim_vec)); 
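// Only the namespace of the DDim factory changes in hunks like the one
// above: Resize still receives the same DDim, now built by
// common::make_ddim instead of phi::make_ddim.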
x_tensor->mutable_data(platform::CUDAPlace(0)); OpTeller::Global().SetOpConverterType(conv2d_op, OpConverterType::Default); diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index 1d23aeedc5a8d7..738097190767a3 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -28,20 +28,20 @@ void TensorRTSplitTest(const std::vector<int> &in_shape, framework::Scope scope; TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000); auto make_dim = [](const std::vector<int> &shape) { nvinfer1::Dims3 dim; dim.c() = shape[0]; dim.h() = shape[1]; dim.w() = shape[2]; return dim; }; validator.DeclInputVar("split_input", make_dim(in_shape)); std::vector<std::string> output_vars; for (size_t i = 0; i < sections.size(); ++i) { auto out_shape = in_shape; out_shape[Axis - 1] = sections[i]; std::string output_name = "split_out" + std::to_string(i); validator.DeclOutputVar(output_name, make_dim(out_shape)); output_vars.push_back(output_name); } diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 8a41d564a09da3..8901d0a43fd41b 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -131,7 +131,7 @@ class TRTConvertValidation { auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); - x_tensor->Resize(phi::make_ddim(dim_vec)); + x_tensor->Resize(common::make_ddim(dim_vec)); RandomizeTensor(x_tensor, place_, ctx); } // Declare a variable in a fluid Scope.
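These converter-test hunks are mechanical renames: with DDim moved under paddle/common, common::make_ddim and common::product are drop-in replacements for the old phi:: spellings. A minimal sketch of the two helpers as the hunks use them, assuming paddle/common/ddim.h keeps the old overload set (the function name below is illustrative only):

    #include <vector>
    #include "paddle/common/ddim.h"

    int64_t ExampleShapeMath() {
      // Build a DDim from a vector, exactly as the old phi::make_ddim did.
      common::DDim dims = common::make_ddim(std::vector<int64_t>{3, 2, 3, 3});
      // product multiplies all extents: 3 * 2 * 3 * 3 = 54 here. The next
      // hunk divides such a count by max_batch_size_ for a per-batch size.
      return common::product(dims);
    }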
@@ -226,7 +226,7 @@ class TRTConvertValidation { size_t fluid_out_size = fluid_outs[index].size(); if (if_add_batch_ == true) { fluid_out_size = - batch_size * (phi::product(tensor->dims()) / max_batch_size_); + batch_size * (common::product(tensor->dims()) / max_batch_size_); } for (size_t i = 0; i < fluid_out_size; i++) { diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h index 0196d81754fdd9..599b8557f8ad8a 100644 --- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h @@ -17,10 +17,10 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/platform/macros.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/flat_hash_map.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 983b19ca4a8a12..dba6582eb36538 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -741,7 +741,7 @@ struct SimpleOpTypeSetTeller : public Teller { if (op_type == "affine_channel") { if (!desc.HasAttr("data_layout")) return false; - auto data_layout = phi::StringToDataLayout( + auto data_layout = common::StringToDataLayout( PADDLE_GET_CONST(std::string, desc.GetAttr("data_layout"))); if (data_layout != phi::DataLayout::kNCHW) return false; @@ -816,7 +816,7 @@ struct SimpleOpTypeSetTeller : public Teller { if (!desc.HasAttr(attr)) return false; } if (desc.HasAttr("data_layout")) { - auto data_layout = phi::StringToDataLayout( + auto data_layout = common::StringToDataLayout( PADDLE_GET_CONST(std::string, desc.GetAttr("data_layout"))); if (data_layout != phi::DataLayout::kNCHW && data_layout != phi::DataLayout::kNHWC) @@ -861,7 +861,7 @@ struct SimpleOpTypeSetTeller : public Teller { for (auto const& attr : attrs) { if (!desc.HasAttr(attr)) return false; } - auto data_layout = phi::StringToDataLayout( + auto data_layout = common::StringToDataLayout( PADDLE_GET_CONST(std::string, desc.GetAttr("data_layout"))); if (data_layout != phi::DataLayout::kNCHW && data_layout != phi::DataLayout::kNHWC) @@ -928,7 +928,7 @@ struct SimpleOpTypeSetTeller : public Teller { } } - auto data_layout = phi::StringToDataLayout( + auto data_layout = common::StringToDataLayout( PADDLE_GET_CONST(std::string, desc.GetAttr("data_layout"))); if (data_layout != phi::DataLayout::kNCHW && data_layout != phi::DataLayout::kNHWC) { diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index bfc9e6b9072daf..9f45d49aa420f2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -57,7 +57,11 @@ endif() nv_library( tensorrt_plugin SRCS ${TRT_FILES} - DEPS enforce tensorrt_engine tensor bert_encoder_functor + DEPS enforce + tensorrt_engine + tensor + bert_encoder_functor + common tensorrt_dynamic_shape_infermeta_factory tensorrt_plugin_arg_mapping_context) diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu index d2f373bca07de8..aee768b5df4b48 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu @@ -131,7 +131,7 @@ void ElementwiseAddTransposePluginDynamic::configurePlugin( if (x_numel <= 0) { return; } - ele_out_tensor_.Resize(phi::make_ddim(x_shape)); + ele_out_tensor_.Resize(common::make_ddim(x_shape)); paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); platform::CUDAPlace place(platform::GetCurrentDeviceId()); @@ -139,20 +139,20 @@ void ElementwiseAddTransposePluginDynamic::configurePlugin( const phi::GPUContext &dev_ctx = *device_context; if (x_type == nvinfer1::DataType::kFLOAT) { - x_meta_ = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim(x_shape)); - y_meta_ = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim(y_shape)); - out_meta_ = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim(out_shape)); + x_meta_ = phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim(x_shape)); + y_meta_ = phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim(y_shape)); + out_meta_ = phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim(out_shape)); dev_ctx.template Alloc(&ele_out_tensor_, x_numel * sizeof(float)); } else if (x_type == nvinfer1::DataType::kHALF) { - x_meta_ = - phi::DenseTensorMeta(phi::DataType::FLOAT16, phi::make_ddim(x_shape)); - y_meta_ = - phi::DenseTensorMeta(phi::DataType::FLOAT16, phi::make_ddim(y_shape)); - out_meta_ = - phi::DenseTensorMeta(phi::DataType::FLOAT16, phi::make_ddim(out_shape)); + x_meta_ = phi::DenseTensorMeta(phi::DataType::FLOAT16, + common::make_ddim(x_shape)); + y_meta_ = phi::DenseTensorMeta(phi::DataType::FLOAT16, + common::make_ddim(y_shape)); + out_meta_ = phi::DenseTensorMeta(phi::DataType::FLOAT16, + common::make_ddim(out_shape)); dev_ctx.template Alloc( &ele_out_tensor_, x_numel * sizeof(phi::dtype::float16)); } diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu index c56f8da2044737..ba91d96b7b59a2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu @@ -593,7 +593,7 @@ int GenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc, for (int k = 0; k < input_shape.size(); k++) input_numel *= input_shape[k]; auto data_type_and_size = nvType2PhiType(input_desc[i].type); phi::DenseTensorMeta input_meta(data_type_and_size.first, - phi::make_ddim(input_shape)); + common::make_ddim(input_shape)); std::shared_ptr input_alloc( new phi::Allocation((void*)(inputs[i]), // NOLINT input_numel * data_type_and_size.second, @@ -617,7 +617,7 @@ int GenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc, auto data_type_and_size = nvType2PhiType(output_desc[i].type); phi::DenseTensorMeta output_meta(data_type_and_size.first, - phi::make_ddim(output_shape)); + common::make_ddim(output_shape)); std::shared_ptr output_alloc( new phi::Allocation(reinterpret_cast(outputs[i]), output_numel * data_type_and_size.second, diff --git a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu index 4d5517ef111ed4..c9e56f1d63823d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu @@ -18,8 +18,8 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/group_norm_kernel.h" #include +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -395,7 +395,7 @@ int GroupNormPlugin::enqueue(int batch_size, for (int i = 0; i < input_dims.nbDims; i++) { input_shape.push_back(input_dims.d[i]); } - const auto input_ddim = phi::make_ddim(input_shape); + const auto input_ddim = common::make_ddim(input_shape); int C = input_shape[1]; @@ -578,7 +578,7 @@ int GroupNormPluginDynamic::enqueue( input_shape.push_back(input_dims.d[i]); } - const auto input_ddim = phi::make_ddim(input_shape); + const auto input_ddim = common::make_ddim(input_shape); int C = input_shape[1]; int image_size = input_shape[2] * input_shape[3]; diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu index 82e24bea09aaca..fe666415c6c00d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu @@ -75,8 +75,8 @@ int InstanceNormPlugin::enqueue(int batch_size, int h = input_dims.d[1]; int w = input_dims.d[2]; - scale_t.Resize(phi::make_ddim({batch_size, c})); - bias_t.Resize(phi::make_ddim({batch_size, c})); + scale_t.Resize(common::make_ddim({batch_size, c})); + bias_t.Resize(common::make_ddim({batch_size, c})); int device_id; cudaGetDevice(&device_id); float *scale_d = scale_t.mutable_data(platform::CUDAPlace(device_id)); @@ -170,8 +170,8 @@ int InstanceNormPluginDynamic::enqueue( int h = input_dims.d[2]; int w = input_dims.d[3]; - scale_t.Resize(phi::make_ddim({n, c})); - bias_t.Resize(phi::make_ddim({n, c})); + scale_t.Resize(common::make_ddim({n, c})); + bias_t.Resize(common::make_ddim({n, c})); int device_id; cudaGetDevice(&device_id); float *scale_d = scale_t.mutable_data(platform::CUDAPlace(device_id)); diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 09a93d661bd351..da26780c975a11 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -107,8 +107,8 @@ int LayerNormPlugin::enqueue(int batch_size, for (int i = 0; i < input_dims.nbDims; i++) { input_shape.push_back(input_dims.d[i]); } - const auto input_ddim = phi::make_ddim(input_shape); - auto matrix_dim = phi::flatten_to_2d(input_ddim, begin_norm_axis); + const auto input_ddim = common::make_ddim(input_shape); + auto matrix_dim = common::flatten_to_2d(input_ddim, begin_norm_axis); int feature_size = static_cast(matrix_dim[1]); PADDLE_ENFORCE_EQ(feature_size, scale_.size(), @@ -127,8 +127,8 @@ int LayerNormPlugin::enqueue(int batch_size, int device_id; cudaGetDevice(&device_id); - mean_t.Resize(phi::make_ddim({batched_mean_shape})); - variance_t.Resize(phi::make_ddim({batched_variance_shape})); + mean_t.Resize(common::make_ddim({batched_mean_shape})); + variance_t.Resize(common::make_ddim({batched_variance_shape})); float *mean_d = mean_t.mutable_data(platform::CUDAPlace(device_id)); float *variance_d = variance_t.mutable_data(platform::CUDAPlace(device_id)); @@ -309,8 +309,8 @@ int LayerNormPluginDynamic::enqueue( "but got:%d", variance_shape_[0])); - const auto input_ddim = phi::make_ddim(input_shape); - auto matrix_dim = phi::flatten_to_2d(input_ddim, begin_norm_axis); + const auto 
input_ddim = phi::make_ddim(input_shape); - auto matrix_dim = phi::flatten_to_2d(input_ddim, begin_norm_axis); + const auto input_ddim = common::make_ddim(input_shape); + auto matrix_dim = common::flatten_to_2d(input_ddim, begin_norm_axis); int feature_size = static_cast<int>(matrix_dim[1]); PADDLE_ENFORCE_EQ(feature_size, scale_.size(), @@ -329,8 +329,8 @@ int LayerNormPluginDynamic::enqueue( int device_id; cudaGetDevice(&device_id); - mean_t.Resize(phi::make_ddim(mean_shape_)); - variance_t.Resize(phi::make_ddim(variance_shape_)); + mean_t.Resize(common::make_ddim(mean_shape_)); + variance_t.Resize(common::make_ddim(variance_shape_)); float *mean_d = mean_t.mutable_data<float>(platform::CUDAPlace(device_id)); float *variance_d = variance_t.mutable_data<float>(platform::CUDAPlace(device_id)); diff --git a/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu index 01a91662c2f251..7ccf5d8a8a1bc7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/plugin/preln_groupnorm_act_op_plugin.h" #include +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu index 45bd8688da18b4..95c408fa859251 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/plugin/skip_groupnorm_act_op_plugin.h" #include +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu index a9177ee2d8f6ae..7fd486c30acc40 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu @@ -337,7 +337,7 @@ int TransLayerNormPluginDynamic::enqueue( std::vector<int> trans_result_shape{ input_shape[0], input_shape[2], input_shape[3], input_shape[1]}; - const auto input_ddim = phi::make_ddim(input_shape); + const auto input_ddim = common::make_ddim(input_shape); int feature_size = static_cast<int>(input_ddim[1]); PADDLE_ENFORCE_EQ(feature_size, scale_.size(), @@ -371,8 +371,8 @@ int TransLayerNormPluginDynamic::enqueue( auto *device_context = static_cast<phi::GPUContext *>(pool.Get(place)); const phi::GPUContext &dev_ctx = *device_context; - mean_t.Resize(phi::make_ddim(mean_shape_)); - variance_t.Resize(phi::make_ddim(variance_shape_)); + mean_t.Resize(common::make_ddim(mean_shape_)); + variance_t.Resize(common::make_ddim(variance_shape_)); float *mean_d = dev_ctx.template Alloc<float>(&mean_t, mean_shape_[0] * sizeof(float)); float *variance_d = dev_ctx.template Alloc<float>( @@ -388,15 +388,15 @@ int TransLayerNormPluginDynamic::enqueue( int trans_result_numel = input_numel; int norm_result_numel = input_numel; phi::DenseTensorMeta input_meta(phi::DataType::FLOAT32, - phi::make_ddim(input_shape)); + common::make_ddim(input_shape)); phi::DenseTensorMeta 
bias_meta(phi::DataType::FLOAT32, - phi::make_ddim({feature_size})); + common::make_ddim({feature_size})); phi::DenseTensorMeta scale_meta(phi::DataType::FLOAT32, - phi::make_ddim({feature_size})); - phi::DenseTensorMeta trans_result_meta(phi::DataType::FLOAT32, - phi::make_ddim(trans_result_shape)); - phi::DenseTensorMeta norm_result_meta(phi::DataType::FLOAT32, - phi::make_ddim(trans_result_shape)); + common::make_ddim({feature_size})); + phi::DenseTensorMeta trans_result_meta( + phi::DataType::FLOAT32, common::make_ddim(trans_result_shape)); + phi::DenseTensorMeta norm_result_meta( + phi::DataType::FLOAT32, common::make_ddim(trans_result_shape)); std::shared_ptr input_alloc(new phi::Allocation( static_cast(const_cast(input)), // NOLINT input_numel * sizeof(float), @@ -446,13 +446,13 @@ int TransLayerNormPluginDynamic::enqueue( if (input_desc[0].format == nvinfer1::PluginFormat::kLINEAR) { VLOG(1) << "TRT Plugin format selected. trans_layernorm-->kLINEAR"; phi::DenseTensorMeta input_meta(phi::DataType::FLOAT16, - phi::make_ddim(input_shape)); + common::make_ddim(input_shape)); std::shared_ptr input_alloc(new phi::Allocation( static_cast(const_cast(input)), // NOLINT input_numel * sizeof(half), place)); phi::DenseTensorMeta trans_result_meta( - phi::DataType::FLOAT16, phi::make_ddim(trans_result_shape)); + phi::DataType::FLOAT16, common::make_ddim(trans_result_shape)); std::shared_ptr trans_result_alloc( new phi::Allocation(static_cast(dst), // NOLINT trans_result_numel * sizeof(half), diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index 8d2eb4cb4919b4..b565df0ec3d8cd 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -15,11 +15,11 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/layout.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) #include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" #endif @@ -86,7 +86,7 @@ class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { void PrepareInputOutput(const std::vector &input, std::vector output_shape) { paddle::framework::TensorFromVector(input, *ctx_, &input_); - output_.Resize(phi::make_ddim(output_shape)); + output_.Resize(common::make_ddim(output_shape)); } void PrepareShapeInput(const std::vector &input) { paddle::framework::TensorFromVector(input, *ctx_, &shape_); @@ -202,7 +202,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test { void PrepareInputOutput(const std::vector &input, std::vector output_shape) { paddle::framework::TensorFromVector(input, *ctx_, &input_); - output_.Resize(phi::make_ddim(output_shape)); + output_.Resize(common::make_ddim(output_shape)); } void GetOutput(std::vector *output) { @@ -377,7 +377,7 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { paddle::framework::TensorFromVector(inputs[i], *ctx_, &inputs_[i]); } for (int i = 0; i < num_outputs; ++i) { - outputs_[i].Resize(phi::make_ddim(output_shapes[i])); + outputs_[i].Resize(common::make_ddim(output_shapes[i])); } } @@ -573,7 +573,7 @@ class TensorRTDynamicTestFusedTokenPruneHalf : public ::testing::Test { paddle::framework::TensorFromVector(inputs[i], *ctx_, &inputs_[i]); } for (int i = 0; i < num_outputs; ++i) { - outputs_[i].Resize(phi::make_ddim(output_shapes[i])); + outputs_[i].Resize(common::make_ddim(output_shapes[i])); } } diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 256af16fb155fc..8d64d0d0891445 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -59,7 +59,7 @@ class TensorRTEngineTest : public ::testing::Test { void PrepareInputOutput(const std::vector &input, std::vector output_shape) { paddle::framework::TensorFromVector(input, *ctx_, &input_); - output_.Resize(phi::make_ddim(output_shape)); + output_.Resize(common::make_ddim(output_shape)); } void GetOutput(std::vector *output) { diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index 176738ce6295e0..3cb30da55e407e 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -37,7 +37,7 @@ TRTInt8Calibrator::TRTInt8Calibrator( std::string input_name = it.first; int data_size = it.second; int num_ele = data_size / sizeof(int16_t); - framework::DDim data_shape = phi::make_ddim({num_ele}); + framework::DDim data_shape = common::make_ddim({num_ele}); temp_tensor.Resize(data_shape); data_tensors_.push_back(temp_tensor); data_buffers_[input_name] = std::pair( diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 5804a637574f11..46b74a60ad4449 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,7 +1,7 @@ cc_library( benchmark SRCS benchmark.cc - DEPS enforce) + DEPS enforce common) paddle_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) 
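The benchmark target gains a common dependency because the enforce machinery's error types now come from paddle/common. A sketch of the include and call pattern these targets rely on, assuming the phi::errors spelling remains an alias for the relocated common::errors (which is what lets existing call sites compile unchanged; the function name below is illustrative):

    #include "paddle/common/errors.h"
    #include "paddle/phi/core/enforce.h"

    void CheckRank(int rank) {
      // Same macro and factory spelling as before; only the home of the
      // error types moved, which is why DEPS gains common.
      PADDLE_ENFORCE_EQ(
          rank,
          2,
          phi::errors::InvalidArgument("expected rank 2, got %d", rank));
    }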
cc_library( infer_io_utils @@ -10,7 +10,7 @@ cc_library( cc_library( model_utils SRCS model_utils.cc - DEPS proto_desc enforce) + DEPS proto_desc enforce common) cc_test_old( infer_io_utils_tester diff --git a/paddle/fluid/ir_adaptor/translator/attribute_translator.cc b/paddle/fluid/ir_adaptor/translator/attribute_translator.cc index ebb58cc0ebf61d..928087c8cb8d8c 100644 --- a/paddle/fluid/ir_adaptor/translator/attribute_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/attribute_translator.cc @@ -17,14 +17,14 @@ #include #include +#include "paddle/common/enforce.h" +#include "paddle/common/layout.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/utils/data_type.h" -#include "paddle/pir/core/enforce.h" #include "paddle/utils/variant.h" namespace paddle { diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index 3c21bffebabdbb..8db987eb20fd70 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -22,6 +22,7 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/ir_adaptor/translator/attribute_translator.h" #include "paddle/fluid/ir_adaptor/translator/op_compat_info.h" @@ -37,7 +38,6 @@ #include "paddle/pir/core/builder.h" #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/builtin_type.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/ir_context.h" #include "paddle/pir/core/operation.h" #include "paddle/pir/core/value.h" @@ -980,7 +980,7 @@ pir::OpResult TranslateDropOutStateIn(pir::IrContext* ctx, pir::Builder builder(ctx, block); dialect::FullOp full_op = builder.Build( - phi::vectorize(tensor_type.dims()), + common::vectorize(tensor_type.dims()), 0.0f, dialect::TransToPhiDataType(tensor_type.dtype()), phi::CPUPlace()); @@ -1331,7 +1331,7 @@ ValueInfo GetTensorInfoByVarName(const OpDesc& op_desc, dialect::DenseTensorType tensor_type = type.dyn_cast(); - std::vector shape = phi::vectorize(tensor_type.dims()); + std::vector shape = common::vectorize(tensor_type.dims()); return std::make_tuple(shape, tensor_type, value); } @@ -1416,7 +1416,7 @@ struct MulOpTranscriber : public OpTranscriber { builder.Build(x_value, x_new_shape); pir::OpResult x_new = reshape_op_x.out(); VLOG(6) << "[" << op_desc.Type() << "] x_shape change from " - << x_tensor_type.dims() << " to " << phi::make_ddim(x_new_shape); + << x_tensor_type.dims() << " to " << common::make_ddim(x_new_shape); std::vector y_new_shape( {std::max(std::accumulate(y_shape.begin(), @@ -1434,7 +1434,7 @@ struct MulOpTranscriber : public OpTranscriber { builder.Build(y_value, y_new_shape); pir::OpResult y_new = reshape_op_y.out(); VLOG(6) << "[" << op_desc.Type() << "] y_shape change from " - << y_tensor_type.dims() << " to " << phi::make_ddim(y_new_shape); + << y_tensor_type.dims() << " to " << common::make_ddim(y_new_shape); return {x_new, y_new}; } @@ -1482,7 +1482,7 @@ struct MulOpTranscriber : public OpTranscriber { pir::OpResult out_new = reshape_op_out.out().dyn_cast(); VLOG(6) << "[" << op_desc.Type() << "] out_shape change from " << out_tensor_type.dims() << " to " - << phi::make_ddim(out_new_shape); + << common::make_ddim(out_new_shape); 
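// Note: in the VLOG statements above, common::make_ddim serves only to
// pretty-print a std::vector<int64_t> shape through DDim's stream operator,
// so the log reads like "x_shape change from [M, K] to [M, N]"; no tensor
// is resized here.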
param_map->PushValue(output_name, VariableDefiningInfo(out_new, false, -1)); @@ -1579,7 +1579,7 @@ struct MulGradOpTranscriber : public OpTranscriber { builder.Build(x_value, x_new_shape); pir::OpResult x_new = reshape_op_x.out(); VLOG(6) << "[" << op_desc.Type() << "] x_shape change from " - << x_tensor_type.dims() << " to " << phi::make_ddim(x_new_shape); + << x_tensor_type.dims() << " to " << common::make_ddim(x_new_shape); std::vector y_new_shape( {std::max(std::accumulate(y_shape.begin(), @@ -1597,7 +1597,7 @@ struct MulGradOpTranscriber : public OpTranscriber { builder.Build(y_value, y_new_shape); pir::OpResult y_new = reshape_op_y.out(); VLOG(6) << "[" << op_desc.Type() << "] y_shape change from " - << y_tensor_type.dims() << " to " << phi::make_ddim(y_new_shape); + << y_tensor_type.dims() << " to " << common::make_ddim(y_new_shape); std::vector out_grad_new_shape( {x_new_shape.front(), y_new_shape.back()}); @@ -1607,7 +1607,7 @@ struct MulGradOpTranscriber : public OpTranscriber { pir::OpResult out_grad_new = reshape_op_out_grad.out(); VLOG(6) << "[" << op_desc.Type() << "] out_grad_shape change from " << out_grad_tensor_type.dims() << " to " - << phi::make_ddim(out_grad_new_shape); + << common::make_ddim(out_grad_new_shape); return {x_new, y_new, out_grad_new}; } @@ -1653,7 +1653,7 @@ struct MulGradOpTranscriber : public OpTranscriber { op_desc.Type(), var_name.substr(0, 1)); std::vector shape = var_desc->GetShape(); - DenseTensorTypeStorage::Dim dim = phi::make_ddim(shape); + DenseTensorTypeStorage::Dim dim = common::make_ddim(shape); pir::OpResult value_res = operation->result(idx_in_op); auto reshape_op = builder.Build(value_res, shape); @@ -2016,7 +2016,7 @@ struct ElementwiseTranscriber : public OpTranscriber { x_type); dialect::DenseTensorType x_tensor_type = x_type.dyn_cast(); - std::vector x_shape = phi::vectorize(x_tensor_type.dims()); + std::vector x_shape = common::vectorize(x_tensor_type.dims()); auto y_names = op_desc.Input("Y", true); IR_ENFORCE(y_names.size() == 1, @@ -2047,7 +2047,7 @@ struct ElementwiseTranscriber : public OpTranscriber { y_type); dialect::DenseTensorType y_tensor_type = y_type.dyn_cast(); - std::vector y_shape = phi::vectorize(y_tensor_type.dims()); + std::vector y_shape = common::vectorize(y_tensor_type.dims()); if (axis < 0) { axis += static_cast(x_shape.size()); @@ -2075,7 +2075,8 @@ struct ElementwiseTranscriber : public OpTranscriber { builder.Build(y_value, y_new_shape); y_new = reshape_op.out(); VLOG(6) << "[" << op_desc.Type() << "] y_shape change from " - << y_tensor_type.dims() << " to " << phi::make_ddim(y_new_shape); + << y_tensor_type.dims() << " to " + << common::make_ddim(y_new_shape); } else { auto shape_op = builder.Build(y_value); auto append_shape_op = builder.Build( @@ -2182,7 +2183,7 @@ struct ElementwiseGradTranscriber : public OpTranscriber { return; } - std::vector y_shape = phi::vectorize(y_tensor_type.dims()); + std::vector y_shape = common::vectorize(y_tensor_type.dims()); pir::Builder builder(ctx, operation->GetParent()); auto reshape_op = builder.Build(value, y_shape); param_map->PushValue(y_grad_var_name, @@ -2400,7 +2401,7 @@ struct RandIntOpTranscriber : public OpTranscriber { pir::Type dtype = type_translator[var_type](ctx, *var); paddle::dialect::DenseTensorTypeStorage::Dim dim = - phi::make_ddim(var->GetShape()); + common::make_ddim(var->GetShape()); paddle::dialect::DenseTensorTypeStorage::DataLayout layout = paddle::dialect::DenseTensorTypeStorage::DataLayout::UNDEFINED; 
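// DenseTensorTypeStorage::Dim is the relocated DDim type, which is why
// common::make_ddim(var->GetShape()) can initialize it directly from the
// VarDesc shape; the layout and LoD fields below keep their UNDEFINED and
// empty defaults.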
paddle::dialect::DenseTensorTypeStorage::LoD lod = {}; diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.cc b/paddle/fluid/ir_adaptor/translator/program_translator.cc index 468f1f6b1d0282..9f8f2259550a84 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/program_translator.cc @@ -17,6 +17,7 @@ #include #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/ir_adaptor/translator/attribute_translator.h" @@ -33,7 +34,6 @@ #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/builtin_type.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/operation.h" #include "paddle/pir/core/value.h" #include "paddle/pir/dialect/control_flow/ir/cf_dialect.h" diff --git a/paddle/fluid/ir_adaptor/translator/type_translator.cc b/paddle/fluid/ir_adaptor/translator/type_translator.cc index b251ff5ae45da4..8a9a26373f085d 100644 --- a/paddle/fluid/ir_adaptor/translator/type_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/type_translator.cc @@ -88,7 +88,8 @@ TypeTranslator::TypeTranslator() { pir::Type dtype = this->operator[](var_desc.GetDataType())(ctx, var_desc); - DenseTensorTypeStorage::Dim dim = phi::make_ddim(var_desc.GetShape()); + DenseTensorTypeStorage::Dim dim = + common::make_ddim(var_desc.GetShape()); DenseTensorTypeStorage::DataLayout layout = DenseTensorTypeStorage::DataLayout::UNDEFINED; DenseTensorTypeStorage::LoD lod = {}; @@ -114,7 +115,8 @@ TypeTranslator::TypeTranslator() { pir::Type dtype = this->operator[](var_desc.GetDataType())(ctx, var_desc); - SelectedRowsTypeStorage::Dim dim = phi::make_ddim(var_desc.GetShape()); + SelectedRowsTypeStorage::Dim dim = + common::make_ddim(var_desc.GetShape()); SelectedRowsTypeStorage::DataLayout layout = SelectedRowsTypeStorage::DataLayout::UNDEFINED; SelectedRowsTypeStorage::LoD lod = {}; diff --git a/paddle/fluid/ir_adaptor/translator/utils.cc b/paddle/fluid/ir_adaptor/translator/utils.cc index 7f50115c5c578e..ebba4428220f70 100644 --- a/paddle/fluid/ir_adaptor/translator/utils.cc +++ b/paddle/fluid/ir_adaptor/translator/utils.cc @@ -16,12 +16,12 @@ #include +#include "paddle/common/enforce.h" #include "paddle/fluid/ir_adaptor/translator/op_translator.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_type.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/utils.h" namespace paddle { diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc index d6986b51306ebd..0b2e20f77837a2 100644 --- a/paddle/fluid/jit/layer.cc +++ b/paddle/fluid/jit/layer.cc @@ -14,9 +14,9 @@ #include "paddle/fluid/jit/layer.h" +#include "paddle/common/errors.h" #include "paddle/fluid/framework/variable.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/fluid/jit/compilation_unit.h" #include "paddle/fluid/jit/engine/base_engine.h" diff --git a/paddle/fluid/jit/property.cc b/paddle/fluid/jit/property.cc index 9b0c50a954624c..687468df83a3dc 100644 --- a/paddle/fluid/jit/property.cc +++ b/paddle/fluid/jit/property.cc @@ -18,10 +18,10 @@ limitations under the License. 
*/ #include "glog/logging.h" +#include "paddle/common/errors.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/jit/property.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace jit { diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index d0f131ec931156..5b49d927ae6762 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -9,7 +9,7 @@ endif() cc_library( malloc SRCS malloc.cc - DEPS place enforce allocator profiler ${MKLDNN_CTX_DEPS}) + DEPS place enforce common allocator profiler ${MKLDNN_CTX_DEPS}) cc_library( memcpy SRCS memcpy.cc @@ -17,7 +17,7 @@ cc_library( cc_library( stats SRCS stats.cc - DEPS enforce) + DEPS enforce common) cc_library(memory DEPS malloc memcpy stats) cc_test( diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 21ffde20022afc..ffce57d78f1642 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(ALLOCATOR_DEPS place stats profiler phi device_context) +set(ALLOCATOR_DEPS place stats profiler phi common device_context) set(ALLOCATOR_SRCS allocator.cc cpu_allocator.cc @@ -32,7 +32,7 @@ if(WITH_GPU OR WITH_ROCM) endif() if(WITH_GPU) - list(APPEND ALLOCATOR_DEPS phi) + list(APPEND ALLOCATOR_DEPS phi common) endif() if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 6af73d8f48958d..d469b8e278f64d 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/common/macros.h" #include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" @@ -25,7 +26,6 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/macros.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc index e18646f0e82bf9..fd61c4f2c39524 100644 --- a/paddle/fluid/memory/stats.cc +++ b/paddle/fluid/memory/stats.cc @@ -14,9 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/stats.h" +#include "paddle/common/macros.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/flags.h" -#include "paddle/phi/core/macros.h" PADDLE_DEFINE_EXPORTED_bool( log_memory_stats, diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 60efffc107dccd..fe5fae7bafaebb 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -90,7 +90,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_utils static_prim_api get_expected_kernel_func) +set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi common phi_utils static_prim_api get_expected_kernel_func) register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op lstm_op run_program_op quantize_linear_op recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} processgroup_comm_utils) @@ -98,8 +98,8 @@ register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 genera op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS}) op_library(run_program_op DEPS executor_cache ${OP_HEADER_DEPS}) target_link_libraries(run_program_op cuda_graph_with_memory_pool) -op_library(quantize_linear_op DEPS phi) -op_library(save_combine_op DEPS string_array phi) +op_library(quantize_linear_op DEPS phi common) +op_library(save_combine_op DEPS string_array phi common) op_library(load_combine_op DEPS string_array) if (WITH_GPU OR WITH_ROCM) @@ -141,9 +141,9 @@ if (WITH_DGC) endif() cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator) -cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute phi) +cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute phi common) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} phi) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} phi common) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_utils lod_tensor unpooling lod_rank_table context_project executor static_prim_api) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc static_prim_api static_utils static_global_utils prim_utils) @@ -191,7 +191,7 @@ endif() copy_if_different(${pybind_file} ${pybind_file_final}) if (WITH_CUSTOM_DEVICE) -cc_library(custom_device_common_op_registry SRCS custom_device_common_op_registry.cc DEPS operator phi type_info) +cc_library(custom_device_common_op_registry SRCS custom_device_common_op_registry.cc DEPS operator phi common type_info) endif() if(NOT "${OP_LIST}" STREQUAL "") diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc index 137249a30d4553..f44c181cca0977 100644 --- a/paddle/fluid/operators/affine_channel_op.cc +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -70,8 +70,8 @@ class AffineChannelOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto scale_dims = ctx->GetInputDim("Scale"); auto b_dims = ctx->GetInputDim("Bias"); - const phi::DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); + const phi::DataLayout data_layout = common::StringToDataLayout( + ctx->Attrs().Get("data_layout")); const int64_t C = (data_layout == phi::DataLayout::kNCHW ? 
x_dims[1] @@ -196,7 +196,7 @@ class AffineChannelKernel : public framework::OpKernel { y->mutable_data(ctx.GetPlace()); const phi::DataLayout layout = - phi::StringToDataLayout(ctx.Attr("data_layout")); + common::StringToDataLayout(ctx.Attr("data_layout")); auto dims = x->dims(); int N = static_cast(dims[0]); @@ -243,7 +243,7 @@ class AffineChannelGradKernel : public framework::OpKernel { auto* dbias = ctx.Output(framework::GradVarName("Bias")); const phi::DataLayout layout = - phi::StringToDataLayout(ctx.Attr("data_layout")); + common::StringToDataLayout(ctx.Attr("data_layout")); auto dims = x->dims(); int N = static_cast(dims[0]); diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 6ec8d77da2c856..a07f311c6125ef 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -60,7 +60,7 @@ class AffineChannelCUDAKernel : public framework::OpKernel { y->mutable_data(ctx.GetPlace()); const phi::DataLayout layout = - phi::StringToDataLayout(ctx.Attr("data_layout")); + common::StringToDataLayout(ctx.Attr("data_layout")); auto& dev_ctx = ctx.template device_context(); auto dims = x->dims(); @@ -147,7 +147,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { auto* dbias = ctx.Output(framework::GradVarName("Bias")); const phi::DataLayout layout = - phi::StringToDataLayout(ctx.Attr("data_layout")); + common::StringToDataLayout(ctx.Attr("data_layout")); auto& dev_ctx = ctx.template device_context(); auto dims = dy->dims(); diff --git a/paddle/fluid/operators/affine_channel_op_xpu.cc b/paddle/fluid/operators/affine_channel_op_xpu.cc index 944a516f6c8f43..799bb87cf9892b 100644 --- a/paddle/fluid/operators/affine_channel_op_xpu.cc +++ b/paddle/fluid/operators/affine_channel_op_xpu.cc @@ -37,7 +37,7 @@ class AffineChannelXPUKernel : public framework::OpKernel { y->mutable_data(ctx.GetPlace()); const phi::DataLayout layout = - phi::StringToDataLayout(ctx.Attr("data_layout")); + common::StringToDataLayout(ctx.Attr("data_layout")); auto dims = x->dims(); int N = dims[0]; @@ -99,7 +99,7 @@ class AffineChannelGradXPUKernel : public framework::OpKernel { auto* dbias = ctx.Output(framework::GradVarName("Bias")); const phi::DataLayout layout = - phi::StringToDataLayout(ctx.Attr("data_layout")); + common::StringToDataLayout(ctx.Attr("data_layout")); auto dims = x->dims(); int N = dims[0]; diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 2325de03211a30..2c85ec6ea2076b 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -107,11 +107,12 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { platform::Place place = x[0].place(); auto data_type = x[0].dtype(); int64_t batch_size = x[0].dims()[0]; - framework::DDim ins_dims = - rank > 1 ? phi::slice_ddim(x[0].dims(), 1, rank) : phi::make_ddim({0}); + framework::DDim ins_dims = rank > 1 + ? common::slice_ddim(x[0].dims(), 1, rank) + : common::make_ddim({0}); for (size_t i = 1; i < x.size(); ++i) { - auto ins_i_dims = rank > 1 ? phi::slice_ddim(x[i].dims(), 1, rank) - : phi::make_ddim({0}); + auto ins_i_dims = rank > 1 ? 
common::slice_ddim(x[i].dims(), 1, rank) + : common::make_ddim({0}); PADDLE_ENFORCE_EQ( ins_i_dims, ins_dims, @@ -144,9 +145,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { data_type)); batch_size += x[i].dims()[0]; } - auto ins_dim_vec = phi::vectorize(ins_dims); + auto ins_dim_vec = common::vectorize(ins_dims); ins_dim_vec.insert(ins_dim_vec.begin(), batch_size); - framework::DDim out_dims = phi::make_ddim(ins_dim_vec); + framework::DDim out_dims = common::make_ddim(ins_dim_vec); out->Resize(out_dims); out->mutable_data(place, data_type); diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h index f5b74c5441174a..2a6a31ba03004d 100644 --- a/paddle/fluid/operators/assign_value_op.h +++ b/paddle/fluid/operators/assign_value_op.h @@ -126,7 +126,7 @@ class AssignValueKernel : public framework::OpKernel { break; } CopyVectorToTensor(value_name, out, ctx); - out->Resize(phi::make_ddim(shape)); + out->Resize(common::make_ddim(shape)); } }; diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 4ec16e62f2ffad..7986bc8499427a 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -107,7 +107,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { "Expected input(H0)'s dimension is 2. But received %d.", h_dims.size())); if (ctx->IsRuntime() || - (phi::product(c_dims) > 0 && phi::product(h_dims) > 0)) { + (common::product(c_dims) > 0 && common::product(h_dims) > 0)) { PADDLE_ENFORCE_EQ(h_dims, c_dims, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 270e0debbdb1b6..fd05b018bbfb66 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -79,7 +79,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { } const DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); + common::StringToDataLayout(ctx->Attrs().Get("data_layout")); if (ctx->IsRuntime() && ctx->HasInput("MomentumTensor")) { auto mom = ctx->Inputs("MomentumTensor"); @@ -144,8 +144,9 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { bool check = true; if (!ctx->HasInput("Scale") || !ctx->HasInput("Bias") || - ((!ctx->IsRuntime()) && (phi::product(ctx->GetInputDim("Scale")) <= 0 || - phi::product(ctx->GetInputDim("Bias")) <= 0))) { + ((!ctx->IsRuntime()) && + (common::product(ctx->GetInputDim("Scale")) <= 0 || + common::product(ctx->GetInputDim("Bias")) <= 0))) { check = false; } @@ -229,7 +230,7 @@ phi::KernelKey BatchNormOp::GetKernelTypeForVar( auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_layout = ar.Get("data_layout"); - auto dl = phi::StringToDataLayout(data_layout); + auto dl = common::StringToDataLayout(data_layout); // Some models may have intentionally set "AnyLayout" for pool // op. 
Treat this as NCHW (default data_format value) if (dl != phi::DataLayout::kAnyLayout) { @@ -368,7 +369,7 @@ void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNormGrad"); const auto x_dims = ctx->GetInputDim("X"); const DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); + common::StringToDataLayout(ctx->Attrs().Get("data_layout")); const int C = static_cast( ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) @@ -418,7 +419,7 @@ phi::KernelKey BatchNormGradOp::GetKernelTypeForVar( auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_layout = ar.Get("data_layout"); - auto dl = phi::StringToDataLayout(data_layout); + auto dl = common::StringToDataLayout(data_layout); // Some models may have intentionally set "AnyLayout" for pool // op. Treat this as NCHW (default data_format value) if (dl != phi::DataLayout::kAnyLayout) { @@ -510,7 +511,7 @@ void BatchNormDoubleGradOp::InferShape( const auto x_dims = ctx->GetInputDim("X"); const DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); + common::StringToDataLayout(ctx->Attrs().Get("data_layout")); const int C = static_cast( ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) ? x_dims[1] diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc index 1a6561fc383cc6..111f128fc3cc6b 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cc +++ b/paddle/fluid/operators/bilateral_slice_op.cc @@ -81,7 +81,7 @@ class BilateralSliceOp : public framework::OperatorWithKernel { output_dims.push_back(h); output_dims.push_back(w); - ctx->SetOutputDim("Out", phi::make_ddim(output_dims)); + ctx->SetOutputDim("Out", common::make_ddim(output_dims)); } protected: diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index 4a2928338251e1..c628bad0aa3c05 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -38,10 +38,10 @@ class BprLossOp : public framework::OperatorWithKernel { "Input(X) and Input(Label) shall have the same rank.")); if (ctx->IsRuntime() || - (phi::product(x_dims) > 0 && phi::product(label_dims) > 0)) { + (common::product(x_dims) > 0 && common::product(label_dims) > 0)) { PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank - 1), - phi::slice_ddim(label_dims, 0, rank - 1), + common::slice_ddim(x_dims, 0, rank - 1), + common::slice_ddim(label_dims, 0, rank - 1), platform::errors::InvalidArgument( "Input(X) and Input(Label) shall have the same shape " "except the last dimension.")); @@ -93,13 +93,13 @@ class BprLossGradientOp : public framework::OperatorWithKernel { rank, platform::errors::InvalidArgument( "Input(Label) and Input(X) should have the same rank.")); - PADDLE_ENFORCE_EQ(phi::slice_ddim(x_dims, 0, rank - 1), - phi::slice_ddim(label_dims, 0, rank - 1), + PADDLE_ENFORCE_EQ(common::slice_ddim(x_dims, 0, rank - 1), + common::slice_ddim(label_dims, 0, rank - 1), platform::errors::InvalidArgument( "The Input(X) and Input(Label) should have the same " "shape except the last dimension.")); - PADDLE_ENFORCE_EQ(phi::slice_ddim(x_dims, 0, rank - 1), - phi::slice_ddim(dy_dims, 0, rank - 1), + PADDLE_ENFORCE_EQ(common::slice_ddim(x_dims, 0, rank - 1), + common::slice_ddim(dy_dims, 0, rank - 1), platform::errors::InvalidArgument( "The Input(X) and Input(Y@Grad) should have the same " "shape except 
the last dimension.")); diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index 5527aefff3cfa9..d56fd36c55c649 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -8,6 +8,7 @@ cc_library( cinn_launch_context SRCS cinn_launch_context.cc DEPS phi + common lod_tensor scope proto_desc diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0700028807fc05..bd32fa2a875dbe 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -26,6 +26,7 @@ #include "paddle/cinn/hlir/framework/tensor.h" #include "paddle/cinn/runtime/cinn_runtime.h" #include "paddle/cinn/runtime/intrinsic.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" @@ -42,7 +43,6 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" -#include "paddle/phi/core/ddim.h" #include "paddle/pir/core/program.h" #include "paddle/pir/core/value.h" #include "paddle/utils/string/string_helper.h" @@ -267,12 +267,12 @@ void CinnLaunchContext::CheckTensorEquivalent( "Variable(%s) not applied in cinn", var_name)); // check dimension auto cinn_tensor = GetCinnTensorOfVar(var_name); - auto cinn_dims = phi::make_ddim(cinn_tensor->shape().data()); + auto cinn_dims = common::make_ddim(cinn_tensor->shape().data()); if (paddle_tensor.dims().size() == 0) { // VLOG when paddle inputs 0D-Tensor VLOG(4) << "Paddle inputs 0D-Tensor, CINN changes 0D-Tensor " << var_name << " to 1D-Tensor"; - PADDLE_ENFORCE_EQ(phi::make_ddim({1}), + PADDLE_ENFORCE_EQ(common::make_ddim({1}), cinn_dims, phi::errors::PreconditionNotMet( "Tensor's shape of variable(%s) are not consistent, " diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index 34667bddc423d3..3d0b8d5f64b1d2 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -21,10 +21,10 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" // type declaration forward struct cinn_buffer_t; diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index efac6332c6d29c..ecfae25270f911 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -604,7 +604,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, true, &num_classes_per_device); T actual_num_samples = num_classes_per_device.data()[rank + 1]; - sampled_local_class_center->Resize(phi::make_ddim({actual_num_samples})); + sampled_local_class_center->Resize(common::make_ddim({actual_num_samples})); T* sampled_local_class_center_ptr = dev_ctx.template Alloc(sampled_local_class_center); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index cef1390ed23907..1c8c8f00217cc5 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -18,7 +18,7 @@ foreach(src ${OPS}) 
endforeach() if(WITH_GLOO) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper phi) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper phi common) endif() register_operators( @@ -31,7 +31,8 @@ register_operators( ${COLLECTIVE_DEPS}) if(WITH_NCCL OR WITH_RCCL) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper phi) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper phi + common) op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() diff --git a/paddle/fluid/operators/collective/barrier_op.h b/paddle/fluid/operators/collective/barrier_op.h index 099d6cccb9a039..b05f2de53a0739 100644 --- a/paddle/fluid/operators/collective/barrier_op.h +++ b/paddle/fluid/operators/collective/barrier_op.h @@ -18,10 +18,10 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/ddim.h" #if defined(PADDLE_WITH_GLOO) #include diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index ab5d28b3a9db27..2a0087cd8aa72b 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -33,7 +33,7 @@ class CAllGatherOp : public framework::OperatorWithKernel { framework::DDim dim = ctx->GetInputDim("X"); // 0D use stack/unstack while others use concat/split if (dim.size() == 0) { - dim = phi::make_ddim({nranks}); + dim = common::make_ddim({nranks}); } else { dim[0] = dim[0] * nranks; if (dim[0] < 0) dim[0] = -1; diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h index c5373bf1304380..b4aff2c2363ec2 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.h +++ b/paddle/fluid/operators/collective/c_allgather_op.h @@ -18,10 +18,10 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/ddim.h" #if defined(PADDLE_WITH_GLOO) #include diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index d7cbd5e8653669..4d49bc4990c6ec 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -74,7 +74,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out->data(), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. 
received " - << phi::product(out->dims()); + << common::product(out->dims()); } } diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc index 637490e59b2d9d..c40f582085a082 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cc +++ b/paddle/fluid/operators/collective/c_embedding_op.cc @@ -40,9 +40,9 @@ class CEmbeddingOp : public framework::OperatorWithKernel { table_dims.size(), table_dims)); - auto output_dims = phi::vectorize(ids_dims); + auto output_dims = common::vectorize(ids_dims); output_dims.push_back(table_dims[1]); - ctx->SetOutputDim("Out", phi::make_ddim(output_dims)); + ctx->SetOutputDim("Out", common::make_ddim(output_dims)); if (ctx->GetOutputsVarType("Out")[0] == framework::proto::VarType::LOD_TENSOR) { diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index ada47430d4b562..20884d1ae8a969 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -19,11 +19,11 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h index e523293e8e68c4..52af0b9c435412 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.h +++ b/paddle/fluid/operators/collective/c_reducescatter_op.h @@ -18,11 +18,11 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/global_gather_op.cc b/paddle/fluid/operators/collective/global_gather_op.cc index a78f40686e9250..de93ca747b4e9f 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cc @@ -44,7 +44,7 @@ class GlobalGatherOp : public framework::OperatorWithKernel { "The input tensor's dimension must be 2. 
" "But received input's dimension = %d.", ndim_input)); - framework::DDim out_dims = phi::make_ddim({-1, -1}); + framework::DDim out_dims = common::make_ddim({-1, -1}); ctx->SetOutputDim("Out", out_dims); } diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index d95c194452174e..7a9c02628088fd 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -128,7 +128,7 @@ struct GlobalGatherFunctor { for (auto i = 0; i < local_count_len; ++i) { fwd_count += cpu_local_count_data[i]; } - framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat}); + framework::DDim out_dims = common::make_ddim({fwd_count, in_feat}); int64_t* expert_ptr = new int64_t[n_expert * nranks]; expert_ptr[0] = 0; auto tot_experts = n_expert * nranks; @@ -268,7 +268,7 @@ struct GlobalGatherProcessGroupFunctor { for (auto i = 0; i < local_count_len; ++i) { fwd_count += cpu_local_count_data[i]; } - framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat}); + framework::DDim out_dims = common::make_ddim({fwd_count, in_feat}); int64_t* expert_ptr = new int64_t[n_expert * nranks]; expert_ptr[0] = 0; auto tot_experts = n_expert * nranks; diff --git a/paddle/fluid/operators/collective/global_scatter_op.cc b/paddle/fluid/operators/collective/global_scatter_op.cc index dc6f1fd735baca..095f968306bdc2 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cc @@ -47,7 +47,7 @@ class GlobalScatterOp : public framework::OperatorWithKernel { "But received input's dimension = %d.", ndim_input)); - framework::DDim out_dims = phi::make_ddim({-1, -1}); + framework::DDim out_dims = common::make_ddim({-1, -1}); ctx->SetOutputDim("Out", out_dims); } diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index d8cd6d4be5f54a..6b915d35be0430 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -133,7 +133,7 @@ struct GlobalScatterFunctor { for (auto i = 0; i < global_count_len; ++i) { fwd_count += cpu_global_count_data[i]; } - framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat}); + framework::DDim out_dims = common::make_ddim({fwd_count, in_feat}); int64_t* expert_ptr = new int64_t[n_expert * nranks]; expert_ptr[0] = 0; auto tot_experts = n_expert * nranks; @@ -274,7 +274,7 @@ struct GlobalScatterProcessGroupFunctor { for (auto i = 0; i < global_count_len; ++i) { fwd_count += cpu_global_count_data[i]; } - framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat}); + framework::DDim out_dims = common::make_ddim({fwd_count, in_feat}); int64_t* expert_ptr = new int64_t[n_expert * nranks]; expert_ptr[0] = 0; auto tot_experts = n_expert * nranks; diff --git a/paddle/fluid/operators/collective/partial_allgather_op.h b/paddle/fluid/operators/collective/partial_allgather_op.h index 815558d0227eb0..178545f4dd2d3c 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.h +++ b/paddle/fluid/operators/collective/partial_allgather_op.h @@ -18,10 +18,10 @@ limitations under the License. 
@@ -18,10 +18,10 @@ limitations under the License. */
 #include
 #include

+#include "paddle/common/ddim.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/ddim.h"

 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/collective/partial_recv_op.cc b/paddle/fluid/operators/collective/partial_recv_op.cc
index 681864e4e1aa4b..5d8a1276a630e7 100644
--- a/paddle/fluid/operators/collective/partial_recv_op.cc
+++ b/paddle/fluid/operators/collective/partial_recv_op.cc
@@ -68,15 +68,15 @@ class PartialRecvOp : public framework::OperatorWithKernel {
                             i,
                             out_shape[i]));
     }
-    auto out_dims = phi::make_ddim(out_shape);
-    int64_t numel = phi::product(out_dims);
+    auto out_dims = common::make_ddim(out_shape);
+    int64_t numel = common::product(out_dims);
     PADDLE_ENFORCE_EQ(
         (numel % num),
         0,
         platform::errors::InvalidArgument(
             "The output numel (%d) must be divisible by num(%d)", numel, num));
-    ctx->SetOutputDim("Out", phi::make_ddim(out_shape));
+    ctx->SetOutputDim("Out", common::make_ddim(out_shape));
   }

 protected:
diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc
index 260e676affdc36..40757ca89daa89 100644
--- a/paddle/fluid/operators/collective/recv_v2_op.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op.cc
@@ -63,7 +63,7 @@ class RecvOpV2 : public framework::OperatorWithKernel {
                               i,
                               out_shape[i]));
       }
-      ctx->SetOutputDim("Out", phi::make_ddim(out_shape));
+      ctx->SetOutputDim("Out", common::make_ddim(out_shape));
     }
   }
 }
diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc
index b85f37d2144f1d..41c2e70df8c35f 100644
--- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc
@@ -238,7 +238,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel {
         PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
             out->data(), numel, dtype, peer, comm->comm(), stream));
         VLOG(3) << "rank " << comm->rank() << " recv "
-                << phi::product(out_dims) << " from " << peer;
+                << common::product(out_dims) << " from " << peer;
       }
     }
     return;
@@ -277,7 +277,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel {
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
           out->data(), numel, dtype, peer, comm->comm(), stream));
       VLOG(3) << "rank " << comm->rank() << " recv "
-              << phi::product(out->dims()) << " from " << peer;
+              << common::product(out->dims()) << " from " << peer;
     }
 #else
     PADDLE_THROW(platform::errors::Unavailable(
diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc
index 523830bbb187bf..86be6908e3cd28 100644
--- a/paddle/fluid/operators/collective/send_v2_op.cu.cc
+++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc
@@ -225,8 +225,8 @@ class SendOpV2CUDAKernel : public framework::OpKernel {
         PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
             x.data(), numel, dtype, peer, comm->comm(), stream));
       }
-      VLOG(3) << "rank " << comm->rank() << " send " << phi::product(x.dims())
-              << " to " << peer;
+      VLOG(3) << "rank " << comm->rank() << " send "
+              << common::product(x.dims()) << " to " << peer;
     }
     return;
   }
@@ -251,8 +251,8 @@ class SendOpV2CUDAKernel : public framework::OpKernel {
           platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype()));
       PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
           x->data(), numel, dtype, peer, comm->comm(), stream));
-      VLOG(3) << "rank " << comm->rank() << " send " << phi::product(x->dims())
-              << " to " << peer;
+      VLOG(3) << "rank " << comm->rank() << " send "
+              << common::product(x->dims()) << " to " << peer;
     }
 #else
     PADDLE_THROW(platform::errors::Unavailable(
diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc
index fcb58dcb242270..52836ead345a1b 100644
--- a/paddle/fluid/operators/common_infer_shape_functions.cc
+++ b/paddle/fluid/operators/common_infer_shape_functions.cc
@@ -102,7 +102,7 @@ framework::DDim BroadcastTwoDims(const framework::DDim &x_dims,
                                          out_dims_array.data(),
                                          max_dim,
                                          axis);
-  return phi::make_ddim(out_dims_array);
+  return common::make_ddim(out_dims_array);
 }

 }  // namespace details
diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
index b9aff315444f52..c04e897aa63665 100644
--- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
@@ -174,7 +174,7 @@ class ReadFromArrayOp : public ArrayOp {
         framework::AttributeMap attrs;
         attrs["dtype"] = framework::TransToProtoVarType(fw_var_tensor.dtype());
-        attrs["shape"] = phi::vectorize(fw_var_tensor.dims());
+        attrs["shape"] = common::vectorize(fw_var_tensor.dims());
         attrs["value"] = 0.0f;

         auto zero_op = framework::OpRegistry::CreateOp(
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 5c3e1c127606c8..5d4d65aa0a2ca6 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -397,10 +397,10 @@ class WhileGradOp : public framework::OperatorBase {
             auto shape = var_desc->GetShape();
             VLOG(8) << "Found uninitialized tensor " << outside_og_name
dims=" - << phi::make_ddim(shape); + << common::make_ddim(shape); framework::AttributeMap attrs; attrs["dtype"] = var_desc->GetDataType(); - attrs["shape"] = phi::vectorize(phi::make_ddim(shape)); + attrs["shape"] = common::vectorize(common::make_ddim(shape)); attrs["value"] = 0.0f; auto var_name = outside_og_name; @@ -541,7 +541,7 @@ class WhileGradOp : public framework::OperatorBase { framework::AttributeMap attrs; attrs["dtype"] = framework::TransToProtoVarType(inside_tensor.dtype()); - attrs["shape"] = phi::vectorize(inside_tensor.dims()); + attrs["shape"] = common::vectorize(inside_tensor.dims()); attrs["value"] = 0.0f; auto var_name = pg_ig_names[param_id]; diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc index bedadbd18746c4..4547026e74e913 100644 --- a/paddle/fluid/operators/correlation_op.cc +++ b/paddle/fluid/operators/correlation_op.cc @@ -106,7 +106,7 @@ class CorrelationOp : public framework::OperatorWithKernel { kernel_size, pad_size, max_displacement); - ctx->SetOutputDim("Output", phi::make_ddim(output_shape)); + ctx->SetOutputDim("Output", common::make_ddim(output_shape)); } protected: diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index b615fbd58faeca..03d0b8a0a72be9 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -44,16 +44,16 @@ class CropOp : public framework::OperatorWithKernel { for (size_t i = 0; i < shape.size(); ++i) { tensor_shape[i] = static_cast(shape[i]); } - ctx->SetOutputDim("Out", phi::make_ddim(tensor_shape)); + ctx->SetOutputDim("Out", common::make_ddim(tensor_shape)); } else { auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(phi::arity(x_dim), - phi::arity(y_dim), + PADDLE_ENFORCE_EQ(common::arity(x_dim), + common::arity(y_dim), platform::errors::InvalidArgument( "The number of dimensions (%d) of CropOp's input(X)" " must be equal to that (%d) of input(Y).", - phi::arity(x_dim), - phi::arity(y_dim))); + common::arity(x_dim), + common::arity(y_dim))); ctx->SetOutputDim("Out", y_dim); } } diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 0c791f01bd9235..fdb2c538fd8a35 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -89,7 +89,7 @@ void CropFunction(const framework::ExecutionContext& context) { out_dims[0] = x->dims()[0]; } out->mutable_data(out_dims, context.GetPlace()); - auto x_stride = phi::stride(x->dims()); + auto x_stride = common::stride(x->dims()); auto offsets = GetOffsets(context); int64_t offset = 0; for (size_t i = 0; i < offsets.size(); ++i) { diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 0eeb917860735b..42ece4219187a5 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -34,14 +34,14 @@ class CrossEntropyOpBase : public framework::OperatorWithKernel { auto label_dims = ctx->GetInputDim("Label"); int rank = x_dims.size(); - bool contain_unknown_dim = phi::contain_unknown_dim(x_dims) || - phi::contain_unknown_dim(label_dims); + bool contain_unknown_dim = common::contain_unknown_dim(x_dims) || + common::contain_unknown_dim(label_dims); bool check = ctx->IsRuntime() || !contain_unknown_dim; if (check) { PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank - 1), - phi::slice_ddim(label_dims, 0, rank - 1), + common::slice_ddim(x_dims, 0, rank - 1), + common::slice_ddim(label_dims, 0, rank - 1), platform::errors::InvalidArgument( 
"Input(X) and Input(Label) shall have the same shape " "except the last dimension. But received: the shape of Input(X) " @@ -166,15 +166,15 @@ class CrossEntropyGradientOpBase : public framework::OperatorWithKernel { dy_dims.size(), label_dims.size())); - bool contain_unknown_dim = - phi::contain_unknown_dim(x_dims) || phi::contain_unknown_dim(dy_dims); + bool contain_unknown_dim = common::contain_unknown_dim(x_dims) || + common::contain_unknown_dim(dy_dims); bool check = ctx->IsRuntime() || !contain_unknown_dim; if (check) { PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank - 1), - phi::slice_ddim(dy_dims, 0, rank - 1), + common::slice_ddim(x_dims, 0, rank - 1), + common::slice_ddim(dy_dims, 0, rank - 1), platform::errors::InvalidArgument( "The Input(X) and Input(Y@Grad) should have the same " "shape except the last dimension. but received: " @@ -321,9 +321,9 @@ class CrossEntropyOp2 : public CrossEntropyOpBase { OP_INOUT_CHECK( ctx->HasOutput("MatchX"), "Output", "MatchX", "CrossEntropyOp2"); auto x_dims = ctx->GetInputDim("X"); - auto x_dims_vec = phi::vectorize(x_dims); + auto x_dims_vec = common::vectorize(x_dims); x_dims_vec.push_back(0); - ctx->SetOutputDim("XShape", phi::make_ddim(x_dims_vec)); + ctx->SetOutputDim("XShape", common::make_ddim(x_dims_vec)); x_dims[x_dims.size() - 1] = 1; ctx->SetOutputDim("MatchX", x_dims); ctx->ShareLoD("X", /*->*/ "XShape"); diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 6e6617c7bc4cca..d755cb1639572a 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -39,10 +39,10 @@ class CrossEntropyOpKernel : public framework::OpKernel { phi::DenseTensor labels_2d, y_2d; if (label_dims.size() < rank) { labels_2d.ShareDataWith(*labels); - labels_2d.Resize({phi::product(label_dims), 1}); + labels_2d.Resize({common::product(label_dims), 1}); y_2d.ShareDataWith(*y); - y_2d.Resize({phi::product(y->dims()), 1}); + y_2d.Resize({common::product(y->dims()), 1}); } else { labels_2d = phi::ReshapeToMatrix(*labels, rank - 1); @@ -250,7 +250,7 @@ class CrossEntropyOpKernel2 : public framework::OpKernel { auto& x_dims = x->dims(); auto feature_size = x_dims[x_dims.size() - 1]; - auto batch_size = phi::product(x->dims()) / feature_size; + auto batch_size = common::product(x->dims()) / feature_size; auto* p_x = x->data(); auto* p_label = label->data(); @@ -283,7 +283,7 @@ class CrossEntropyGradientOpKernel2 : public framework::OpKernel { int64_t ignore_index = ctx.Attr("ignore_index"); int rank = dx->dims().size(); int64_t feature_size = dx->dims()[rank - 1]; - int64_t batch_size = phi::product(dx->dims()) / feature_size; + int64_t batch_size = common::product(dx->dims()) / feature_size; platform::ForRange for_range( ctx.template device_context(), diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index 127d89b8f25fd0..faa2efab772a6e 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -33,7 +33,7 @@ class CTCAlignKernel : public framework::OpKernel { size_t blank = static_cast(ctx.Attr("blank")); bool merge_repeated = ctx.Attr("merge_repeated"); T* output_data = output->mutable_data(ctx.GetPlace()); - auto input_dims = phi::vectorize(input->dims()); + auto input_dims = common::vectorize(input->dims()); const T* input_data = input->data(); // support tensor input, no lod information diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc 
diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc b/paddle/fluid/operators/custom_device_common_op_registry.cc
index f01b0a92d89f82..9573809d6c7ccf 100644
--- a/paddle/fluid/operators/custom_device_common_op_registry.cc
+++ b/paddle/fluid/operators/custom_device_common_op_registry.cc
@@ -678,7 +678,7 @@ class CBroadcastOpCustomDeviceKernel : public framework::OpKernel {
                                 comm->GetXcclComm(),
                                 *stream);
       VLOG(3) << "rank " << comm->GetRank() << " invoke Bcast. received "
-              << phi::product(out->dims());
+              << common::product(out->dims());
     }
     out->set_lod(x->lod());
   }
@@ -956,7 +956,7 @@ class GlobalScatterOpCustomDeviceKernel : public framework::OpKernel {
       for (auto i = 0; i < global_count_len; ++i) {
         fwd_count += cpu_global_count_data[i];
       }
-      framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat});
+      framework::DDim out_dims = common::make_ddim({fwd_count, in_feat});
       int64_t* expert_ptr = new int64_t[n_expert * nranks];
       expert_ptr[0] = 0;
       auto tot_experts = n_expert * nranks;
@@ -1038,7 +1038,7 @@ class GlobalScatterOpCustomDeviceKernel : public framework::OpKernel {
       for (auto i = 0; i < global_count_len; ++i) {
         fwd_count += cpu_global_count_data[i];
       }
-      framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat});
+      framework::DDim out_dims = common::make_ddim({fwd_count, in_feat});
       int64_t* expert_ptr = new int64_t[n_expert * nranks];
       expert_ptr[0] = 0;
       auto tot_experts = n_expert * nranks;
@@ -1170,7 +1170,7 @@ class GlobalGatherOpCustomDeviceKernel : public framework::OpKernel {
       for (auto i = 0; i < local_count_len; ++i) {
         fwd_count += cpu_local_count_data[i];
       }
-      framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat});
+      framework::DDim out_dims = common::make_ddim({fwd_count, in_feat});
       int64_t* expert_ptr = new int64_t[n_expert * nranks];
       expert_ptr[0] = 0;
       auto tot_experts = n_expert * nranks;
@@ -1250,7 +1250,7 @@ class GlobalGatherOpCustomDeviceKernel : public framework::OpKernel {
       for (auto i = 0; i < local_count_len; ++i) {
         fwd_count += cpu_local_count_data[i];
       }
-      framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat});
+      framework::DDim out_dims = common::make_ddim({fwd_count, in_feat});
       int64_t* expert_ptr = new int64_t[n_expert * nranks];
       expert_ptr[0] = 0;
       auto tot_experts = n_expert * nranks;
diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc
index 2e70168876162f..32cc8b49cd007b 100644
--- a/paddle/fluid/operators/data_norm_op.cc
+++ b/paddle/fluid/operators/data_norm_op.cc
@@ -66,8 +66,8 @@ class DataNormOp : public framework::OperatorWithKernel {
     }

     const auto x_dims = ctx->GetInputDim("X");
-    const DataLayout data_layout =
-        phi::StringToDataLayout(ctx->Attrs().Get("data_layout"));
+    const DataLayout data_layout = common::StringToDataLayout(
+        ctx->Attrs().Get("data_layout"));

     PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5,
                       true,
@@ -130,7 +130,7 @@ class DataNormOp : public framework::OperatorWithKernel {
     bool check = true;
     if ((!ctx->IsRuntime()) &&
-        (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) {
+        (common::product(scale_dim) <= 0 || common::product(bias_dim) <= 0)) {
       check = false;
     }
@@ -272,7 +272,7 @@ class DataNormKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext &ctx) const override {
     // const bool is_test = ctx.Attr("is_test");
     const std::string data_layout_str = ctx.Attr("data_layout");
-    const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
+    const DataLayout data_layout = common::StringToDataLayout(data_layout_str);
     const auto *x = ctx.Input("X");
     const auto &x_dims = x->dims();
@@ -452,8 +452,8 @@ class DataNormGradOp : public framework::OperatorWithKernel {
                    "DataNormGrad");

     const auto x_dims = ctx->GetInputDim("X");
-    const DataLayout data_layout =
-        phi::StringToDataLayout(ctx->Attrs().Get("data_layout"));
+    const DataLayout data_layout = common::StringToDataLayout(
+        ctx->Attrs().Get("data_layout"));
     const int C = static_cast(data_layout == DataLayout::kNCHW
                                   ? x_dims[1]
                                   : x_dims[x_dims.size() - 1]);
@@ -516,7 +516,7 @@ class DataNormGradKernel : public framework::OpKernel {
     const auto *means = ctx.Input("Means");
     const std::string data_layout_str = ctx.Attr("data_layout");
-    const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
+    const DataLayout data_layout = common::StringToDataLayout(data_layout_str);

     // Get the size for each dimension.
     // NCHW [batch_size, in_channels, in_height, in_width]
diff --git a/paddle/fluid/operators/dequantize_abs_max_op.h b/paddle/fluid/operators/dequantize_abs_max_op.h
index a403d974a98637..5b07dfb2a9b001 100644
--- a/paddle/fluid/operators/dequantize_abs_max_op.h
+++ b/paddle/fluid/operators/dequantize_abs_max_op.h
@@ -16,9 +16,9 @@ limitations under the License. */

 #include

+#include "paddle/common/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/ddim.h"

 namespace phi {
 class DenseTensor;
diff --git a/paddle/fluid/operators/dequantize_log_op.h b/paddle/fluid/operators/dequantize_log_op.h
index e13b25694f33e5..f17ba146461ae3 100644
--- a/paddle/fluid/operators/dequantize_log_op.h
+++ b/paddle/fluid/operators/dequantize_log_op.h
@@ -16,8 +16,8 @@ limitations under the License. */

 #include

+#include "paddle/common/ddim.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/ddim.h"

 namespace phi {
 class DenseTensor;
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index fe32cc32d02d4b..d38a72556f7596 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -48,8 +48,9 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
 detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS
                   generate_proposal_labels_op.cc)
-detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS phi)
-detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS phi)
+detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS phi common)
+detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS phi
+                  common)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
                   box_decoder_and_assign_op.cu)
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc
index 70c7430c0e23f8..8c3705ba3e760a 100644
--- a/paddle/fluid/operators/detection/anchor_generator_op.cc
+++ b/paddle/fluid/operators/detection/anchor_generator_op.cc
@@ -56,8 +56,8 @@ class AnchorGeneratorOp : public framework::OperatorWithKernel {
     dim_vec[1] = input_dims[3];
     dim_vec[2] = static_cast(num_anchors);
     dim_vec[3] = 4;
-    ctx->SetOutputDim("Anchors", phi::make_ddim(dim_vec));
-    ctx->SetOutputDim("Variances", phi::make_ddim(dim_vec));
+    ctx->SetOutputDim("Anchors", common::make_ddim(dim_vec));
+    ctx->SetOutputDim("Variances", common::make_ddim(dim_vec));
   }

  protected:
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h
index 9e667d9f99fc1c..ea36d1b47c849a 100644
--- a/paddle/fluid/operators/detection/anchor_generator_op.h
+++ b/paddle/fluid/operators/detection/anchor_generator_op.h
@@ -109,7 +109,7 @@ class AnchorGeneratorOpKernel : public framework::OpKernel {

     phi::DenseTensor var_t;
     var_t.mutable_data(
-        phi::make_ddim({1, static_cast(variances.size())}),
+        common::make_ddim({1, static_cast(variances.size())}),
         ctx.GetPlace());
     auto var_et = phi::EigenTensor::From(var_t);
     for (size_t i = 0; i < variances.size(); ++i) {
diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
index 8c97523559de64..552a6da3b34257 100644
--- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
+++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
@@ -127,11 +127,13 @@ class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
                             box_score_dims[1],
                             prior_box_dims[1]));
     }
-    ctx->SetOutputDim("DecodeBox",
-                      phi::make_ddim({target_box_dims[0], target_box_dims[1]}));
+    ctx->SetOutputDim(
+        "DecodeBox",
+        common::make_ddim({target_box_dims[0], target_box_dims[1]}));
     ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox");
-    ctx->SetOutputDim("OutputAssignBox",
-                      phi::make_ddim({prior_box_dims[0], prior_box_dims[1]}));
+    ctx->SetOutputDim(
+        "OutputAssignBox",
+        common::make_ddim({prior_box_dims[0], prior_box_dims[1]}));
     ctx->ShareLoD("PriorBox", /*->*/ "OutputAssignBox");
   }
 };
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc
index bd4230fed59314..881ef3ac18f76f 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.cc
+++ b/paddle/fluid/operators/detection/density_prior_box_op.cc
@@ -92,8 +92,8 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel {
       dim_vec[1] = input_dims[3];
       dim_vec[2] = static_cast(num_priors);
       dim_vec[3] = 4;
-      ctx->SetOutputDim("Boxes", phi::make_ddim(dim_vec));
-      ctx->SetOutputDim("Variances", phi::make_ddim(dim_vec));
+      ctx->SetOutputDim("Boxes", common::make_ddim(dim_vec));
+      ctx->SetOutputDim("Variances", common::make_ddim(dim_vec));
     } else if (ctx->IsRuntime()) {
       int64_t dim0 =
           static_cast(input_dims[2] * input_dims[3] * num_priors);
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h
index 9474e39cb7daf7..995abf11200130 100644
--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -123,7 +123,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel {
     }
     phi::DenseTensor var_t;
     var_t.mutable_data(
-        phi::make_ddim({1, static_cast(variances.size())}),
+        common::make_ddim({1, static_cast(variances.size())}),
         ctx.GetPlace());
     auto var_et = phi::EigenTensor::From(var_t);
diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc
index ca107077232457..0f2ac1c86d6289 100644
--- a/paddle/fluid/operators/detection/iou_similarity_op.cc
+++ b/paddle/fluid/operators/detection/iou_similarity_op.cc
@@ -52,7 +52,7 @@ class IOUSimilarityOp : public framework::OperatorWithKernel {
                           "The shape of Y is [M, 4], but got dimension = %d.",
                           y_dims[1]));
     ctx->ShareLoD("X", /*->*/ "Out");
-    ctx->SetOutputDim("Out", phi::make_ddim({x_dims[0], y_dims[0]}));
ctx->SetOutputDim("Out", common::make_ddim({x_dims[0], y_dims[0]})); } }; diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index 4c3e934fab4dc8..0ce9979ff2a3d4 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -152,7 +152,7 @@ class MineHardExamplesKernel : public framework::OpKernel { out_neg_indices_lod.emplace_back(batch_starts); int neg_offset = 0; auto neg_data = out_neg_indices->mutable_data( - phi::make_ddim({static_cast(batch_starts.back()), 1}), + common::make_ddim({static_cast(batch_starts.back()), 1}), ctx.GetPlace()); for (auto neg_indices : all_neg_indices) { diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 9f3f426d1ad853..be08e4e9680148 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -355,7 +355,7 @@ class MultiClassNMSKernel : public framework::OpKernel { auto index = ctx.Output("Index"); bool has_roisnum = ctx.HasInput("RoisNum") ? true : false; auto rois_num = ctx.Input("RoisNum"); - auto score_dims = phi::vectorize(scores->dims()); + auto score_dims = common::vectorize(scores->dims()); auto score_size = score_dims.size(); auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc index 936480a9e23ddb..0059aedcdc86ca 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -26,7 +26,7 @@ class PolygonBoxTransformCPUKernel : public framework::OpKernel { true, platform::errors::InvalidArgument("It must use CUDAPlace.")); auto* in = ctx.Input("Input"); - auto in_dims = phi::vectorize(in->dims()); + auto in_dims = common::vectorize(in->dims()); const T* in_data = in->data(); auto* out = ctx.Output("Output"); T* out_data = out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h index b49841399c71f9..b6a6e283479df3 100644 --- a/paddle/fluid/operators/detection/prior_box_op.h +++ b/paddle/fluid/operators/detection/prior_box_op.h @@ -181,7 +181,7 @@ class PriorBoxOpKernel : public framework::OpKernel { phi::DenseTensor var_t; var_t.mutable_data( - phi::make_ddim({1, static_cast(variances.size())}), + common::make_ddim({1, static_cast(variances.size())}), ctx.GetPlace()); auto var_et = phi::EigenTensor::From(var_t); diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index a342347986d145..a0879337f5ae75 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -85,7 +85,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel { "Input(PosCount) is not null.")); } - ctx->SetOutputDim("MAP", phi::make_ddim({1})); + ctx->SetOutputDim("MAP", common::make_ddim({1})); } protected: diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index 9579e527a20d4b..ccf08349687939 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -257,12 +257,12 @@ class DetectionMAPOpKernel : public framework::OpKernel { } int* pos_count_data = output_pos_count->mutable_data( - 
-        phi::make_ddim({class_num, 1}), ctx.GetPlace());
+        common::make_ddim({class_num, 1}), ctx.GetPlace());
     T* true_pos_data = output_true_pos->mutable_data(
-        phi::make_ddim({true_pos_count, 2}), ctx.GetPlace());
+        common::make_ddim({true_pos_count, 2}), ctx.GetPlace());
     T* false_pos_data = output_false_pos->mutable_data(
-        phi::make_ddim({false_pos_count, 2}), ctx.GetPlace());
+        common::make_ddim({false_pos_count, 2}), ctx.GetPlace());
     true_pos_count = 0;
     false_pos_count = 0;
     std::vector true_pos_starts = {0};
diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.h b/paddle/fluid/operators/dlnne/dlnne_engine_op.h
index ccc1764b0adb8d..3a8441d76a9dd8 100644
--- a/paddle/fluid/operators/dlnne/dlnne_engine_op.h
+++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.h
@@ -30,6 +30,7 @@
 #include
 #include

+#include "paddle/common/ddim.h"
 #include "paddle/fluid/framework/data_device_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -37,7 +38,6 @@
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
 #include "paddle/fluid/platform/float16.h"
-#include "paddle/phi/core/ddim.h"

 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -423,7 +423,7 @@ class DlnneEngineOp : public framework::OperatorBase {
       // convert input and copy to Dlnne engine's buffer
       auto &t = inference::analysis::GetFromScope(scope, x);
-      auto t_shape = phi::vectorize(t.dims());
+      auto t_shape = common::vectorize(t.dims());
       std::vector runtime_input_shape(t_shape.begin(), t_shape.end());
       const int bind_index = index;
@@ -484,7 +484,7 @@ class DlnneEngineOp : public framework::OperatorBase {
       }
       input_buffers[bind_index] = buffer;
-      auto t_shape = phi::vectorize(t.dims());
+      auto t_shape = common::vectorize(t.dims());
       std::vector runtime_input_shape(t_shape.begin(), t_shape.end());
       for (auto &size : t_shape) {
         data_bytes = data_bytes * size;
@@ -562,7 +562,7 @@ class DlnneEngineOp : public framework::OperatorBase {
       VLOG(4) << bind_index
              << ": out_shapes[bind_index] dim:" << out_shapes[bind_index].size();
-      fluid_t->Resize(phi::make_ddim(out_shapes[bind_index]));
+      fluid_t->Resize(common::make_ddim(out_shapes[bind_index]));
       dl::nne::DataType dl_type = out_types[bind_index];
       if (dlnne_log_flag_) {
@@ -678,7 +678,7 @@ class DlnneEngineOp : public framework::OperatorBase {
       // TODO(pei.jiang): refine this code, because when run dlnne create
       // engine, there is same code
-      auto t_shape = phi::vectorize(t.dims());
+      auto t_shape = common::vectorize(t.dims());
       std::vector input_shape(t_shape.begin(), t_shape.end());
       calib_data_shape_map.emplace(x, input_shape);
       std::string data_type = inference::ConvertType(t.type());
diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h
index 844b0a195031f5..718a78c7811ea3 100644
--- a/paddle/fluid/operators/elementwise/elementwise_functor.h
+++ b/paddle/fluid/operators/elementwise/elementwise_functor.h
@@ -11,8 +11,8 @@ limitations under the License. */
 #pragma once

+#include "paddle/common/array.h"
 #include "paddle/fluid/platform/complex.h"
-#include "paddle/phi/core/utils/array.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"

 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index d8fb9a658fa00d..d835caedbf3c87 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -119,13 +119,13 @@ class ElementwiseOp : public framework::OperatorWithKernel {
       if (should_rotate) {
         // Pick bigger shape and rotate this one
         bool x_over_y = (x_dims.size() > y_dims.size());
-        auto vdims = x_over_y ? phi::vectorize(x_dims)
-                              : phi::vectorize(y_dims);
+        auto vdims = x_over_y ? common::vectorize(x_dims)
+                              : common::vectorize(y_dims);
         std::rotate(vdims.begin() + 1, vdims.begin() + 2, vdims.end());
         if (x_over_y) {
-          x_dims = phi::make_ddim(vdims);
+          x_dims = common::make_ddim(vdims);
         } else {
-          y_dims = phi::make_ddim(vdims);
+          y_dims = common::make_ddim(vdims);
         }
       }
 #endif
@@ -145,7 +145,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
                                  out_dims_array.end());
     }
 #endif
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dims_array));
+    ctx->SetOutputDim("Out", common::make_ddim(out_dims_array));
     // to do
     ctx->ShareLoD("X", /*->*/ "Out");
   }
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 4894dff4b971ca..face0f758f8484 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -473,7 +473,7 @@ void FusedElemwiseAndActComputeNoBroadcast(
     CompoundFunctor compound_functor,
     phi::DenseTensor *out,
     phi::DenseTensor *intermediate_out) {
-  size_t N = static_cast(phi::product(x_dim));
+  size_t N = static_cast(common::product(x_dim));
   platform::ForRange for_range(
       ctx.template device_context(), N);
@@ -654,7 +654,7 @@ void FusedElemwiseAndActGradComputeNoBroadcast(
     DX_OP dx_op,
     DY_OP dy_op,
     DIntermediate_OP dintermediate_op) {
-  size_t N = static_cast(phi::product(x_dim));
+  size_t N = static_cast(common::product(x_dim));
   platform::ForRange for_range(
       ctx.template device_context(), N);
   const T *x_data = nullptr;
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index c6e750f4fe0ecd..4c2dd992657812 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -67,7 +67,7 @@ class ExpandOp : public framework::OperatorWithKernel {
       }
     }

-    ctx->SetOutputDim("Out", phi::make_ddim(out_shape));
+    ctx->SetOutputDim("Out", common::make_ddim(out_shape));
     if (out_shape[0] == x_dims[0]) {
       ctx->ShareLoD("X", "Out");
     }
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index a941fb8171de3e..fde9a0ca0b8a25 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -121,9 +121,9 @@ struct ChannelDequantizeFunctor {
     const T* scale_two = scales[1]->data();
     for (int i = 0; i < batch_size; i++) {
       phi::DenseTensor one_batch_in = in->Slice(i, i + 1).Resize(
-          phi::slice_ddim(in->dims(), 1, in->dims().size()));
+          common::slice_ddim(in->dims(), 1, in->dims().size()));
       phi::DenseTensor one_batch_out = out->Slice(i, i + 1).Resize(
-          phi::slice_ddim(out->dims(), 1, out->dims().size()));
+          common::slice_ddim(out->dims(), 1, out->dims().size()));
       for (int j = 0; j < channel; j++) {
         T s = scale_one[j];
         phi::DenseTensor one_channel_in = one_batch_in.Slice(j, j + 1);
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
index 2718ea6050176b..57887721308d4f 100644
--- a/paddle/fluid/operators/fake_dequantize_op.h
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -16,9 +16,9 @@ limitations under the License. */

 #include

+#include "paddle/common/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/ddim.h"

 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h
index cefe558ab245ed..bdf8a80debb649 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu.h
+++ b/paddle/fluid/operators/fake_quantize_op.cu.h
@@ -82,7 +82,8 @@ struct FindAbsMaxFunctor {
     grid = (grid > block) ? block : grid;
     phi::DenseTensor max;
-    T *max_data = max.mutable_data(phi::make_ddim({grid}), ctx.GetPlace());
+    T *max_data =
+        max.mutable_data(common::make_ddim({grid}), ctx.GetPlace());
     FindAbsMaxKernel
         <<>>(in, num, max_data);
     FindAbsMaxKernel
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 7fb47ebd1a76e7..13f1e5a3a26124 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -330,7 +330,7 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel {
     auto *in_state = context.Input("InState");
     phi::DenseTensor tmp_scale;
-    tmp_scale.Resize(phi::make_dim(1));
+    tmp_scale.Resize(common::make_dim(1));
     T *cur_scale_data = dev_ctx.template Alloc(&tmp_scale);
     FindAbsMaxFunctor()(
@@ -420,7 +420,7 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel {
     auto *in_accum = context.Input("InAccum");
     auto *in_state = context.Input("InState");
     phi::DenseTensor tmp_scale;
-    tmp_scale.Resize(phi::make_dim(1));
+    tmp_scale.Resize(common::make_dim(1));
     T *cur_scale_data = dev_ctx.template Alloc(&tmp_scale);
     FindAbsMaxFunctor()(
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index a398698f40cabb..1263d156ce220b 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -38,7 +38,7 @@ class FillConstantOp : public framework::OperatorWithKernel {
                 "than 0. But received: shape[%u] = %d; shape = [%s].",
                 i,
                 shape[i],
-                phi::make_ddim(shape)));
+                common::make_ddim(shape)));
       }
     }
     if (shape.empty() && ctx->HasInput("ShapeTensor")) {
@@ -48,11 +48,11 @@ class FillConstantOp : public framework::OperatorWithKernel {
         num_ele *= static_cast(shape_dims[i]);
       }
       auto vec_dims = std::vector(num_ele, -1);
-      ctx->SetOutputDim("Out", phi::make_ddim(vec_dims));
+      ctx->SetOutputDim("Out", common::make_ddim(vec_dims));
       return;
     }
-    ctx->SetOutputDim("Out", phi::make_ddim(shape));
+    ctx->SetOutputDim("Out", common::make_ddim(shape));
   }

 protected:
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index aeefe07d348e93..f8c4087344f9c9 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -47,7 +47,7 @@ class FillOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* context) const override {
     OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "Fill");
     auto& shape = context->Attrs().Get>("shape");
-    context->SetOutputDim("Out", phi::make_ddim(shape));
+    context->SetOutputDim("Out", common::make_ddim(shape));
   }

 protected:
diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
index ddb67eef4a3fa6..6325036dc01bfe 100644
--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -52,7 +52,7 @@ class Flatten2Op : public framework::OperatorWithKernel {
             "The axis should be less than or equal to input tensor's rank"));

     const auto &out_dims = Flatten2Op::GetOutputShape(axis, in_dims);
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
+    ctx->SetOutputDim("Out", common::make_ddim(out_dims));
     if (in_dims[0] == out_dims[0]) {
       // Only pass LoD when the first dimension of output and Input(X)
       // are the same.
@@ -65,7 +65,7 @@ class Flatten2Op : public framework::OperatorWithKernel {
     for (int i = 0; i < in_dims.size(); ++i) {
       xshape_dims[i + 1] = in_dims[i];
     }
-    ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims));
+    ctx->SetOutputDim("XShape", common::make_ddim(xshape_dims));
     ctx->ShareLoD("X", "XShape");
   }
@@ -189,7 +189,7 @@ class Flatten2GradOp : public framework::OperatorWithKernel {
                    framework::GradVarName("Out"),
                    "Flatten2Grad");
     auto xshape_dims = context->GetInputDim("XShape");
-    auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size());
     context->SetOutputDim(framework::GradVarName("X"), x_dims);
     context->ShareLoD("XShape", framework::GradVarName("X"));
   }
diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h
index 6942a0f7db2da4..1b71627a067782 100644
--- a/paddle/fluid/operators/flatten_op.h
+++ b/paddle/fluid/operators/flatten_op.h
@@ -38,7 +38,7 @@ class Flatten2Kernel : public framework::OpKernel {
     auto *out = context.Output("Out");
-    auto out_dims = phi::make_ddim(GetOutputShape(axes, x_dims));
+    auto out_dims = common::make_ddim(GetOutputShape(axes, x_dims));
     out->mutable_data(context.GetPlace(), in->type());
     framework::TensorCopy(
@@ -78,7 +78,7 @@ class Flatten2GradKernel : public framework::OpKernel {
     auto *d_out = ctx.Input(framework::GradVarName("Out"));
     auto xshape_dims = ctx.Input("XShape")->dims();
-    auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
+    auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size());
     d_x->mutable_data(ctx.GetPlace(), d_out->type());
     framework::TensorCopy(
diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h
index 762e86406917dd..8f3b5e4f09a065 100644
--- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h
+++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h
@@ -43,7 +43,7 @@ struct BNStatsFinalizeArgs {
         "The size of param_shape is expected to 4. But received "
         "param_shape's size is %d, param_shape is [%s].",
         param_shape.size(),
-        phi::make_ddim(param_shape)));
+        common::make_ddim(param_shape)));

     in_desc.set(param_shape, format, param_dtype);
     out_desc.set(param_shape, format, dtype);
diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
index c82ccc959d204c..9dbb8a8eaebc8f 100644
--- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
+++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
@@ -59,7 +59,7 @@ struct NormConvolutionArgs {
         "The size of input_shape is expected to 4. But received "
         "input_shape's size is %d, input_shape is [%s].",
         input_shape.size(),
-        phi::make_ddim(input_shape)));
+        common::make_ddim(input_shape)));
     PADDLE_ENFORCE_EQ(
         filter_shape.size(),
         4U,
@@ -67,14 +67,14 @@
         "The size of filter_shape is expected to 4. But received "
         "filter_shape's size is %d, filter_shape is [%s].",
         filter_shape.size(),
-        phi::make_ddim(filter_shape)));
+        common::make_ddim(filter_shape)));
     PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] &&
                           (filter_shape[1] == 1 || filter_shape[1] == 3),
                       true,
                       platform::errors::InvalidArgument(
                           "The filter_shape is expected to store as nhwc, and "
                           "h = w = 1 or 3. But received filter_shape is [%s].",
-                          phi::make_ddim(filter_shape)));
+                          common::make_ddim(filter_shape)));
     PADDLE_ENFORCE_EQ((filter_shape[0] % 32 == 0 && filter_shape[3] % 8 == 0),
                       true,
                       platform::errors::InvalidArgument(
@@ -91,7 +91,7 @@ struct NormConvolutionArgs {
         "The size of output_shape is expected to 4. But received "
         "filter_shape's size is %d, filter_shape is [%s].",
         output_shape.size(),
-        phi::make_ddim(output_shape)));
+        common::make_ddim(output_shape)));
     is_support = IsSupport(ctx, filter_shape, stride, dilation, group);
     PADDLE_ENFORCE_EQ(
         is_support,
diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
index 4ecc5795ff41a4..8b731e2c55408b 100644
--- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
+++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h
@@ -47,7 +47,7 @@ struct ScaleBiasAddReluArgs {
         "The size of data_shape is expected to 4. But received "
         "data_shape's size is %d, data_shape is [%s].",
         data_shape.size(),
-        phi::make_ddim(data_shape)));
+        common::make_ddim(data_shape)));
     PADDLE_ENFORCE_EQ(
         param_shape.size(),
         4U,
@@ -55,7 +55,7 @@
         "The size of param_shape is expected to 4. But received "
         "param_shape's size is %d, param_shape is [%s].",
         param_shape.size(),
-        phi::make_ddim(param_shape)));
+        common::make_ddim(param_shape)));
     PADDLE_ENFORCE_EQ(
         bitmask_shape.size(),
         3U,
@@ -63,7 +63,7 @@
         "The size of bitmask_shape is expected to 3. But received "
         "bitmask_shape's size is %d, bitmask_shape is [%s].",
         bitmask_shape.size(),
-        phi::make_ddim(bitmask_shape)));
+        common::make_ddim(bitmask_shape)));

     in_desc.set(data_shape, format, dtype);
     out_desc.set(data_shape, format, dtype);
diff --git a/paddle/fluid/operators/fused/fused_attention_utils.h b/paddle/fluid/operators/fused/fused_attention_utils.h
index 7d17041133bcd7..b198c4a5792912 100644
--- a/paddle/fluid/operators/fused/fused_attention_utils.h
+++ b/paddle/fluid/operators/fused/fused_attention_utils.h
@@ -23,8 +23,8 @@
 PHI_DECLARE_bool(dynamic_static_unified_comm);
 #endif

+#include "paddle/common/errors.h"
 #include "paddle/phi/core/distributed/comm_context_manager.h"
-#include "paddle/phi/core/errors.h"

 namespace phi {
 namespace fusion {
diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc
index ca59a466a5c2b6..2ea40d840d2b38 100644
--- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc
@@ -128,7 +128,7 @@ void FusedBatchNormActOp::InferShape(framework::InferShapeContext *ctx) const {
     bool check = true;
     if ((!ctx->IsRuntime()) &&
-        (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) {
+        (common::product(scale_dim) <= 0 || common::product(bias_dim) <= 0)) {
       check = false;
     }
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
index ed416d4ad13d13..ac198e9cf2c258 100644
--- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
@@ -106,7 +106,7 @@ void FusedBatchNormAddActOp::InferShape(
     bool check = true;
     if ((!ctx->IsRuntime()) &&
-        (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) {
+        (common::product(scale_dim) <= 0 || common::product(bias_dim) <= 0)) {
       check = false;
     }
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
index 96c400ea625d46..e69825fdd90765 100644
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -303,15 +303,15 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel {
   bool is_reverse = ctx.Attr("is_reverse");   \
   bool use_peepholes = ctx.Attr("use_peepholes");

-#define INIT_BASE_SIZES                                  \
-  auto ids_dims = ids->dims();             /* T x M*/    \
-  auto ids_numel = phi::product(ids_dims); /* T x 1*/    \
-  auto wh_dims = wh->dims();               /* D x 4D*/   \
-  const int D = wh_dims[0];                              \
-  const int D2 = D * 2;                                  \
-  const int D3 = D * 3;                                  \
-  int64_t row_number = embeddings->dims()[0];            \
-  int64_t row_width = embeddings->dims()[1];             \
+#define INIT_BASE_SIZES                                     \
+  auto ids_dims = ids->dims();                /* T x M*/    \
+  auto ids_numel = common::product(ids_dims); /* T x 1*/    \
+  auto wh_dims = wh->dims();                  /* D x 4D*/   \
+  const int D = wh_dims[0];                                 \
+  const int D2 = D * 2;                                     \
+  const int D3 = D * 3;                                     \
+  int64_t row_number = embeddings->dims()[0];               \
+  int64_t row_width = embeddings->dims()[1];                \
   const int D4 = wh_dims[1];

 #define INIT_BASE_INPUT_DATAS                         \
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
index 0bd497b4c5ae29..a0ee64bd2eced7 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
@@ -68,7 +68,7 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
     // in compile time, the shape from Ids -> output
     // should be [-1, 1] -> [-1, embedding_size]
-    ctx->SetOutputDim("Out", phi::make_ddim({-1, last_dim}));
+    ctx->SetOutputDim("Out", common::make_ddim({-1, last_dim}));
   }

 protected:
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc
index faf2561e5d3ec6..f6343f5bd1cbf7 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cc
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc
@@ -31,7 +31,7 @@ static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
   if (x_dim.size() > 1) {
     return x_dim;
   }
-  return phi::make_ddim({1, x_dim[0]});
+  return common::make_ddim({1, x_dim[0]});
 }

 class FusedFeedForwardOp : public framework::OperatorWithKernel {
@@ -97,7 +97,7 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel {
       context->SetOutputDim("Dropout2Mask", dim_x);
     }
     framework::DDim mean_dim =
-        phi::make_ddim({mat_dim_x.batch_size_ * mat_dim_x.height_});
+        common::make_ddim({mat_dim_x.batch_size_ * mat_dim_x.height_});
     bool pre_layer_norm = context->Attrs().Get("pre_layer_norm");
     if (pre_layer_norm) {
       OP_INOUT_CHECK(context->HasOutput("Ln1Mean"),
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index ee40633e4252b3..656f8ba6ad0acb 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/fused/fused_attention_utils.h"
 #include "paddle/phi/api/include/tensor.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h
index 89f17f24b74a1e..69fbca0f9be0fc 100644
--- a/paddle/fluid/operators/fused/fused_gate_attention.h
+++ b/paddle/fluid/operators/fused/fused_gate_attention.h
@@ -903,8 +903,8 @@ class FlashAttnWithGating {
     AllocWithDebugInfo(dev_ctx_, "softmax_lse", softmax_lse);

     if (VLOG_IS_ON(6)) {
-      VLOG(6) << "temp_mask_dim={" << phi::make_ddim(temp_mask_dim) << "}";
-      VLOG(6) << "temp_bias_dim={" << phi::make_ddim(temp_bias_dim) << "}";
+      VLOG(6) << "temp_mask_dim={" << common::make_ddim(temp_mask_dim) << "}";
+      VLOG(6) << "temp_bias_dim={" << common::make_ddim(temp_bias_dim) << "}";
       VLOG(6) << TensorDebugString(&cu_seq_q, "cu_seq_q");
       VLOG(6) << TensorDebugString(&cu_seq_k, "cu_seq_k");
       VLOG(6) << TensorDebugString(nonbatched_bias, "nonbatched_bias");
@@ -998,12 +998,12 @@ class FlashAttnWithGating {
     const T* v_ptr = k_ptr + q_size;

     phi::DenseTensor qkv_transpose_out_grad;
-    qkv_transpose_out_grad.Resize(phi::make_ddim({3,
-                                                  config->batch_size,
-                                                  config->seq_len_m,
-                                                  config->seq_len_r,
-                                                  config->num_heads,
-                                                  config->head_dim}));
+    qkv_transpose_out_grad.Resize(common::make_ddim({3,
+                                                     config->batch_size,
+                                                     config->seq_len_m,
+                                                     config->seq_len_r,
+                                                     config->num_heads,
+                                                     config->head_dim}));

     AllocWithDebugInfo(
         dev_ctx_, "qkv_transpose_out_grad", &qkv_transpose_out_grad);
diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu
index 9caca507c08bbd..d066086bd6ae06 100644
--- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu
@@ -412,12 +412,12 @@ class FusedGateAttentionOpKernel : public framework::OpKernel {
       ComputeMergedQKVMatmulForward(ctx, config, query, qkv_out);

       if (config.CanUseFlashAttn()) {
-        qkv_transpose_out->Resize(phi::make_ddim({3,
-                                                  config.batch_size,
-                                                  config.seq_len_m,
-                                                  config.seq_len_r,
-                                                  config.num_heads,
-                                                  config.head_dim}));
+        qkv_transpose_out->Resize(common::make_ddim({3,
+                                                     config.batch_size,
+                                                     config.seq_len_m,
+                                                     config.seq_len_r,
+                                                     config.num_heads,
+                                                     config.head_dim}));
       }
       AllocWithDebugInfo(dev_ctx, "qkv_transpose_out", qkv_transpose_out);
     } else {
diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
index fb0550b0c10e5e..2d6a1122b0c286 100644
--- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
@@ -73,7 +73,7 @@ class FusedGemmEpilogueOp : public framework::OperatorWithKernel {
             y_dims));

     auto x_mat_dims =
-        phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1);
+        common::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1);

     int K_from_x = trans_x ? x_mat_dims[0] : x_mat_dims[1];
y_dims[1] : y_dims[0]; @@ -100,11 +100,11 @@ class FusedGemmEpilogueOp : public framework::OperatorWithKernel { } else { out_dims.push_back(y_dims[1]); } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + ctx->SetOutputDim("Out", common::make_ddim(out_dims)); auto activation = ctx->Attrs().Get("activation"); if (ctx->HasOutput("ReserveSpace")) { - ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims)); + ctx->SetOutputDim("ReserveSpace", common::make_ddim(out_dims)); if (activation == "none") { PADDLE_THROW(platform::errors::InvalidArgument( @@ -235,8 +235,8 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { dout_dims.size(), x_dims.size())); - auto dout_mat_dims = phi::flatten_to_2d(dout_dims, dout_dims.size() - 1); - auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1); + auto dout_mat_dims = common::flatten_to_2d(dout_dims, dout_dims.size() - 1); + auto x_mat_dims = common::flatten_to_2d(x_dims, x_dims.size() - 1); PADDLE_ENFORCE_EQ( dout_mat_dims[1], @@ -272,7 +272,7 @@ class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { if (ctx->HasOutput("DBias")) { int64_t dbias_dim = trans_y ? y_dims[0] : y_dims[1]; - ctx->SetOutputDim("DBias", phi::make_ddim({dbias_dim})); + ctx->SetOutputDim("DBias", common::make_ddim({dbias_dim})); } } diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index 6d7319c1db2721..2ae9f65c4e5a27 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -38,8 +38,8 @@ phi::funcs::MatmulFusedType GetFwdFusedEpilogueType( fused_type = FusedType::kMatmulBiasRelu; } else { fused_type = FusedType::kMatmulBiasReluWithReservedData; - int64_t reserve_size = - SizeOf(phi::DataType::BOOL) * phi::product(reserve_space->dims()); + int64_t reserve_size = SizeOf(phi::DataType::BOOL) * + common::product(reserve_space->dims()); ctx.Alloc(reserve_space, phi::DataType::BOOL, reserve_size); } } else if (activation == "gelu") { @@ -47,7 +47,8 @@ phi::funcs::MatmulFusedType GetFwdFusedEpilogueType( fused_type = FusedType::kMatmulBiasGelu; } else { fused_type = FusedType::kMatmulBiasGeluWithReservedData; - int64_t reserve_size = sizeof(T) * phi::product(reserve_space->dims()); + int64_t reserve_size = + sizeof(T) * common::product(reserve_space->dims()); ctx.Alloc(reserve_space, reserve_size); } } else { @@ -85,7 +86,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { dev_ctx.Alloc(out, out->numel() * sizeof(T)); // (M * K) * (K * N) auto x_mat_dims = - phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + common::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; @@ -142,7 +143,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { // (M * K) * (K * N) auto x_mat_dims = - phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + common::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; int64_t N = trans_y ? 
y->dims()[0] : y->dims()[1]; diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc index 58d81ebf8be06a..fb6afbf5d256d8 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc @@ -46,7 +46,7 @@ class FusedGemmEpilogueXPUKernel : public framework::OpKernel { << " , activation = " << activation; auto x_mat_dims = - phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + common::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); // (M * K) * (K * N) for new api use // int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; @@ -168,7 +168,7 @@ class FusedGemmEpilogueXPUGradKernel : public framework::OpKernel { } auto x_mat_dims = - phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + common::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); phi::XpuFcInfo info_forward; phi::GetFCInfo(x_mat_dims, y->dims(), trans_x, trans_y, &info_forward); diff --git a/paddle/fluid/operators/fused/fused_matmul_op.cc b/paddle/fluid/operators/fused/fused_matmul_op.cc index 198fd61a150780..129f7e85386e70 100644 --- a/paddle/fluid/operators/fused/fused_matmul_op.cc +++ b/paddle/fluid/operators/fused/fused_matmul_op.cc @@ -37,7 +37,7 @@ static std::vector GetInputShape(phi::DDim dim, if (is_input_fused) { dim = dim.reshape(shape).transpose(axis); } - return phi::vectorize(dim); + return common::vectorize(dim); } class FusedMatmulOp : public framework::OperatorWithKernel { @@ -50,8 +50,8 @@ class FusedMatmulOp : public framework::OperatorWithKernel { bool trans_x = ctx->Attrs().Get("trans_x"); bool trans_y = ctx->Attrs().Get("trans_y"); - std::vector dims_x = phi::vectorize(ctx->GetInputDim("X")); - std::vector dims_y = phi::vectorize(ctx->GetInputDim("Y")); + std::vector dims_x = common::vectorize(ctx->GetInputDim("X")); + std::vector dims_y = common::vectorize(ctx->GetInputDim("Y")); auto ndims_x = dims_x.size(); auto ndims_y = dims_y.size(); PADDLE_ENFORCE_GT( @@ -112,7 +112,7 @@ class FusedMatmulOp : public framework::OperatorWithKernel { new_dims.push_back(N); // NOLINT } - ctx->SetOutputDim("Out", phi::make_ddim(new_dims)); + ctx->SetOutputDim("Out", common::make_ddim(new_dims)); ctx->ShareLoD("X", "Out"); }; diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc index d8ef46b040e8b5..3dbba2bf42ce44 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc @@ -101,7 +101,7 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { } else { out_dim = {batch_size, dims[rank - 1] - cvm_offset}; } - outs_dims[i] = phi::make_ddim(out_dim); + outs_dims[i] = common::make_ddim(out_dim); } } else { for (size_t i = 0; i < num_inputs; ++i) { @@ -123,7 +123,7 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { } else { out_dim = {-1, dims[rank - 1] - cvm_offset}; } - outs_dims[i] = phi::make_ddim(out_dim); + outs_dims[i] = common::make_ddim(out_dim); } } ctx->SetOutputsDim("Out", outs_dims); diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 0674fc419938c3..63f065e0fef496 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -65,7 +65,7 @@ class CUDNNConvInceptionFusionOpKernel : public 
framework::OpKernel { dev_ctx.Alloc(temp_outs[0], temp_outs[0]->numel() * sizeof(T)); DataLayout layout = DataLayout::kNCHW; - std::vector in_dim = phi::vectorize(input->dims()); + std::vector in_dim = common::vectorize(input->dims()); // ------------------- cudnn descriptors --------------------- PoolingMode pooling_mode; @@ -87,9 +87,9 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1); cudnnTensorDescriptor_t cudnn_input_desc = - input_desc.descriptor(layout, phi::vectorize(input->dims())); - cudnnTensorDescriptor_t pool_out_desc = - out_pool_desc.descriptor(layout, phi::vectorize(input->dims())); + input_desc.descriptor(layout, common::vectorize(input->dims())); + cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor( + layout, common::vectorize(input->dims())); cudnnDataType_t cudnn_dtype = CudnnDataType::type; cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4]; @@ -130,7 +130,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { : CUDNN_DATA_FLOAT; for (int i = 0; i < 4; ++i) { - filter_dims.push_back(phi::vectorize(filters[i]->dims())); + filter_dims.push_back(common::vectorize(filters[i]->dims())); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data())); bias_dims.push_back({1, filter_dims[i][0], 1, 1}); @@ -260,7 +260,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { in_datas.push_back(static_cast(input_data)); in_datas.push_back( static_cast(output_data + (oc0 + oc1) * h * w)); - temp_outs[1]->Resize(phi::make_ddim(out_dims[2])); + temp_outs[1]->Resize(common::make_ddim(out_dims[2])); T* temp2_data = dev_ctx.Alloc(temp_outs[1], temp_outs[1]->numel() * sizeof(T)); in_datas.push_back(static_cast(temp2_data + oc2 * h * w)); diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc index 4972db58043226..ada14e280a0f3c 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc @@ -353,15 +353,15 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel { cell = cell; auto x_dims = input->dims(); auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) - ? phi::flatten_to_2d(x_dims, 1) + ? common::flatten_to_2d(x_dims, 1) : x_dims; // Get attributes const bool is_reverse = ctx.Attr("is_reverse"); const bool use_peepholes = ctx.Attr("use_peepholes"); // Get tensor dimensions - const auto x_mat_dims_vec = phi::vectorize(x_mat_dims); - const auto weight_h_dims = phi::vectorize(weight_h->dims()); + const auto x_mat_dims_vec = common::vectorize(x_mat_dims); + const auto weight_h_dims = common::vectorize(weight_h->dims()); const auto& input_lod = input->lod()[0]; // Calculate RNN dimensions diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 1c8e0a1b56a977..4dd6a9a48a16d6 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -26,7 +26,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using phi::vectorize; +using common::vectorize; using phi::funcs::OneDNNGetDataType; using phi::funcs::OneDNNMemDesc; using Direction = dnnl::rnn_direction; diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index 1ef675cb1d8f83..54d0860ef1ccfb 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -32,7 +32,7 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "multi_gru"); auto x_dims = ctx->GetInputDim("X"); auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) - ? phi::flatten_to_2d(x_dims, 1) + ? common::flatten_to_2d(x_dims, 1) : x_dims; PADDLE_ENFORCE_EQ( x_mat_dims.size(), diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op.cc b/paddle/fluid/operators/fused/resnet_basic_block_op.cc index d17e6c9872a029..58125a9b7f6740 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { @@ -182,8 +182,8 @@ class ResNetBasicBlockOp : public framework::OperatorWithKernel { int out2_w = (out1_w + padding2 * 2 - filter2_size) / stride2 + 1; std::vector out2_shape = {batch, output2_channel, out2_h, out2_w}; - auto y_dims = phi::make_ddim(out2_shape); - auto conv1_dims = phi::make_ddim(out1_shape); + auto y_dims = common::make_ddim(out2_shape); + auto conv1_dims = common::make_ddim(out1_shape); ctx->SetOutputDim("Y", y_dims); ctx->SetOutputDim("Conv1", conv1_dims); ctx->SetOutputDim("SavedMean1", bn1_param_dims); @@ -206,7 +206,7 @@ class ResNetBasicBlockOp : public framework::OperatorWithKernel { bool find_max = ctx->Attrs().Get("find_conv_input_max"); if (find_max) { - auto max_dims = phi::make_ddim({6}); + auto max_dims = common::make_ddim({6}); ctx->SetOutputDim("MaxInput1", max_dims); ctx->SetOutputDim("MaxFilter1", max_dims); ctx->SetOutputDim("MaxInput2", max_dims); diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc index 3855ea38544609..bd918924cdf09f 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc @@ -51,16 +51,16 @@ class ResnetBasicBlockAttr { auto conv1_out = ctx.Output("Conv1"); auto filter2 = ctx.Input("Filter2"); auto conv2_out = ctx.Output("Conv2"); - conv1_input_shape = phi::vectorize(input1->dims()); - conv1_output_shape = phi::vectorize(conv1_out->dims()); - conv1_filter_shape = phi::vectorize(filter1->dims()); + conv1_input_shape = common::vectorize(input1->dims()); + conv1_output_shape = common::vectorize(conv1_out->dims()); + conv1_filter_shape = common::vectorize(filter1->dims()); conv1_filter_numel = filter1->numel(); conv1_input_numel = input1->numel(); conv1_output_numel = conv1_out->numel(); - conv2_input_shape = phi::vectorize(conv1_out->dims()); - conv2_output_shape = phi::vectorize(conv2_out->dims()); - conv2_filter_shape = phi::vectorize(filter2->dims()); + conv2_input_shape = common::vectorize(conv1_out->dims()); + conv2_output_shape = 
common::vectorize(conv2_out->dims()); + conv2_filter_shape = common::vectorize(filter2->dims()); conv2_filter_numel = filter2->numel(); conv2_input_numel = conv1_out->numel(); conv2_output_numel = conv2_out->numel(); @@ -68,9 +68,9 @@ class ResnetBasicBlockAttr { if (has_shortcut) { auto filter3 = ctx.Input("Filter3"); auto conv3_out = ctx.Output("Conv3"); - conv3_input_shape = phi::vectorize(input1->dims()); - conv3_output_shape = phi::vectorize(conv3_out->dims()); - conv3_filter_shape = phi::vectorize(filter3->dims()); + conv3_input_shape = common::vectorize(input1->dims()); + conv3_output_shape = common::vectorize(conv3_out->dims()); + conv3_filter_shape = common::vectorize(filter3->dims()); conv3_filter_numel = filter3->numel(); conv3_input_numel = input1->numel(); conv3_output_numel = conv3_out->numel(); @@ -139,16 +139,16 @@ class ResnetBasicBlockGradAttr { auto conv1_out = ctx.Input("Conv1"); auto filter2 = ctx.Input("Filter2"); auto conv2_out = ctx.Input("Conv2"); - conv1_input_shape = phi::vectorize(input1->dims()); - conv1_output_shape = phi::vectorize(conv1_out->dims()); - conv1_filter_shape = phi::vectorize(filter1->dims()); + conv1_input_shape = common::vectorize(input1->dims()); + conv1_output_shape = common::vectorize(conv1_out->dims()); + conv1_filter_shape = common::vectorize(filter1->dims()); conv1_filter_numel = filter1->numel(); conv1_input_numel = input1->numel(); conv1_output_numel = conv1_out->numel(); - conv2_input_shape = phi::vectorize(conv1_out->dims()); - conv2_output_shape = phi::vectorize(conv2_out->dims()); - conv2_filter_shape = phi::vectorize(filter2->dims()); + conv2_input_shape = common::vectorize(conv1_out->dims()); + conv2_output_shape = common::vectorize(conv2_out->dims()); + conv2_filter_shape = common::vectorize(filter2->dims()); conv2_filter_numel = filter2->numel(); conv2_input_numel = conv1_out->numel(); conv2_output_numel = conv2_out->numel(); @@ -156,9 +156,9 @@ class ResnetBasicBlockGradAttr { if (has_shortcut) { auto filter3 = ctx.Input("Filter3"); auto conv3_out = ctx.Input("Conv3"); - conv3_input_shape = phi::vectorize(input1->dims()); - conv3_output_shape = phi::vectorize(conv3_out->dims()); - conv3_filter_shape = phi::vectorize(filter3->dims()); + conv3_input_shape = common::vectorize(input1->dims()); + conv3_output_shape = common::vectorize(conv3_out->dims()); + conv3_filter_shape = common::vectorize(filter3->dims()); conv3_filter_numel = filter3->numel(); conv3_input_numel = input1->numel(); conv3_output_numel = conv3_out->numel(); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index 948e8fdc9491e9..f1f2628119c155 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -29,7 +29,7 @@ static framework::DDim GetBitmaskDims(std::vector out_shape) { int32_t c_int32_elems = ((c + 63) & ~63) / 32; int32_t nhw_int32_elems = ((nhw + 31) & ~31); std::vector bitmask_shape = {nhw_int32_elems, c_int32_elems, 1}; - return phi::make_ddim(bitmask_shape); + return common::make_ddim(bitmask_shape); } class ResNetUnitOp : public framework::OperatorWithKernel { @@ -124,11 +124,11 @@ class ResNetUnitOp : public framework::OperatorWithKernel { const auto x_dims = ctx->GetInputDim("X"); const auto w_dims = ctx->GetInputDim("FilterX"); std::vector bn_param_shape = - phi::vectorize(ctx->GetInputDim("ScaleX")); + common::vectorize(ctx->GetInputDim("ScaleX")); if (1 == bn_param_shape.size()) { bn_param_shape = {1, 1, 1, bn_param_shape[0]}; } - 
framework::DDim bn_param_dims = phi::make_ddim(bn_param_shape); + framework::DDim bn_param_dims = common::make_ddim(bn_param_shape); PADDLE_ENFORCE_EQ( x_dims.size(), 4, @@ -181,7 +181,7 @@ class ResNetUnitOp : public framework::OperatorWithKernel { out_shape.push_back(output_channel); } - auto y_dims = phi::make_ddim(out_shape); + auto y_dims = common::make_ddim(out_shape); auto bitmask_dims = GetBitmaskDims(out_shape); // Set dims of outputs ctx->SetOutputDim("Y", y_dims); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu index 7caa0b1caa1afa..5b126008bf6548 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cu +++ b/paddle/fluid/operators/fused/resnet_unit_op.cu @@ -69,20 +69,20 @@ class ResNetUnitKernel : public framework::OpKernel { bool is_train = !is_test && !use_global_stats; std::string act_type = ctx.Attr("act_type"); - auto input_x_shape = phi::vectorize(input_x->dims()); - auto filter_x_shape = phi::vectorize(filter_x->dims()); + auto input_x_shape = common::vectorize(input_x->dims()); + auto filter_x_shape = common::vectorize(filter_x->dims()); // std::swap used to convert shape of filter from conv2d when kernel size is // 1. if (filter_x_shape[1] != filter_x_shape[2] && 1 == filter_x_shape[2]) { std::swap(filter_x_shape[1], filter_x_shape[3]); } auto param_dims = scale_x->dims(); - auto param_shape = phi::vectorize(scale_x->dims()); + auto param_shape = common::vectorize(scale_x->dims()); if (1 == param_shape.size()) { param_shape = {1, 1, 1, param_shape[0]}; } - auto output_shape = phi::vectorize(output->dims()); - auto bitmask_shape = phi::vectorize(bitmask->dims()); + auto output_shape = common::vectorize(output->dims()); + auto bitmask_shape = common::vectorize(bitmask->dims()); int output_channel = filter_x_shape[0]; int64_t ele_count = std::accumulate(output_shape.begin(), output_shape.end(), @@ -157,8 +157,8 @@ class ResNetUnitKernel : public framework::OpKernel { phi::DenseTensor *running_var_z = ctx.Output("RunningVarZ"); - auto input_z_shape = phi::vectorize(input_z->dims()); - auto filter_z_shape = phi::vectorize(filter_z->dims()); + auto input_z_shape = common::vectorize(input_z->dims()); + auto filter_z_shape = common::vectorize(filter_z->dims()); // 3.1 Conv for second input phi::DenseTensor sum_z; @@ -273,11 +273,11 @@ class ResNetUnitGradKernel : public framework::OpKernel { bool use_global_stats = ctx.Attr("use_global_stats"); std::string act_type = ctx.Attr("act_type"); - auto x_shape = phi::vectorize(x->dims()); - auto filter_x_shape = phi::vectorize(filter_x->dims()); - auto param_shape = phi::vectorize(scale_x->dims()); - auto output_shape = phi::vectorize(output->dims()); - auto bitmask_shape = phi::vectorize(bitmask->dims()); + auto x_shape = common::vectorize(x->dims()); + auto filter_x_shape = common::vectorize(filter_x->dims()); + auto param_shape = common::vectorize(scale_x->dims()); + auto output_shape = common::vectorize(output->dims()); + auto bitmask_shape = common::vectorize(bitmask->dims()); auto place = ctx.GetPlace(); auto &dev_ctx = ctx.template device_context(); @@ -360,8 +360,8 @@ class ResNetUnitGradKernel : public framework::OpKernel { eps); // 1.3 Backward of Conv for z, get z_grad and filter_z_grad - auto z_shape = phi::vectorize(z->dims()); - auto filter_z_shape = phi::vectorize(filter_z->dims()); + auto z_shape = common::vectorize(z->dims()); + auto filter_z_shape = common::vectorize(filter_z->dims()); CudnnNormConvolutionGrad conv_z_op(dev_ctx, z_shape, 
filter_z_shape, diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc index 1e4ed290f43a98..c00e58f8463ab5 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc @@ -74,9 +74,9 @@ class ResNetUnitXPUKernel : public framework::OpKernel { reinterpret_cast(conv_out_x->mutable_data(place))}; std::vector> x_shape_list = { - phi::vectorize(input_x->dims())}; + common::vectorize(input_x->dims())}; - auto filter_x_shape = phi::vectorize(filter_x->dims()); + auto filter_x_shape = common::vectorize(filter_x->dims()); std::vector ksize = {filter_x_shape[2], filter_x_shape[3]}; if (!is_nchw) { ksize[0] = filter_x_shape[1]; @@ -122,9 +122,9 @@ class ResNetUnitXPUKernel : public framework::OpKernel { conv_y_list.push_back( reinterpret_cast(conv_out_z->mutable_data(place))); - x_shape_list.push_back(phi::vectorize(input_z->dims())); + x_shape_list.push_back(common::vectorize(input_z->dims())); - auto filter_z_shape = phi::vectorize(filter_z->dims()); + auto filter_z_shape = common::vectorize(filter_z->dims()); std::vector ksize_z = {filter_z_shape[2], filter_z_shape[3]}; if (!is_nchw) { ksize_z[0] = filter_z_shape[1]; @@ -143,7 +143,7 @@ class ResNetUnitXPUKernel : public framework::OpKernel { } else { if (fuse_add) { const phi::DenseTensor *input_z = ctx.Input("Z"); - auto input_z_shape = phi::vectorize(input_z->dims()); + auto input_z_shape = common::vectorize(input_z->dims()); x_list.push_back(reinterpret_cast(input_z->data())); x_shape_list.push_back(input_z_shape); x_maxlist.push_back(nullptr); @@ -239,9 +239,9 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { reinterpret_cast(filter_x_grad->mutable_data(place))}; std::vector> x_shape_list = { - phi::vectorize(x->dims())}; + common::vectorize(x->dims())}; - auto filter_x_shape = phi::vectorize(filter_x->dims()); + auto filter_x_shape = common::vectorize(filter_x->dims()); std::vector x_ksize = {filter_x_shape[2], filter_x_shape[3]}; if (!is_nchw) { x_ksize[0] = filter_x_shape[1]; @@ -298,9 +298,9 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { reinterpret_cast(z_grad->mutable_data(place))); dw_list.push_back( reinterpret_cast(filter_z_grad->mutable_data(place))); - x_shape_list.push_back(phi::vectorize(z->dims())); + x_shape_list.push_back(common::vectorize(z->dims())); - auto filter_z_shape = phi::vectorize(filter_z->dims()); + auto filter_z_shape = common::vectorize(filter_z->dims()); std::vector ksize_z = {filter_z_shape[2], filter_z_shape[3]}; if (!is_nchw) { ksize_z[0] = filter_z_shape[1]; diff --git a/paddle/fluid/operators/generator/CMakeLists.txt b/paddle/fluid/operators/generator/CMakeLists.txt index dc88ea0b3a5336..a47a0a295be8f4 100644 --- a/paddle/fluid/operators/generator/CMakeLists.txt +++ b/paddle/fluid/operators/generator/CMakeLists.txt @@ -356,7 +356,7 @@ file(APPEND ${op_utils_header} # Automatically generate the registration code of all arg map functions # and compile the corresponding target to avoid frequent code conflicts # when writing to same file -register_op_utils(op_compat_infos DEPS phi) +register_op_utils(op_compat_infos DEPS phi common) copy_if_different(${op_utils_header} ${op_utils_header_final}) diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.cc b/paddle/fluid/operators/generator/get_expected_kernel_func.cc index d29311f4621b39..e6715189772ba1 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc +++ 
b/paddle/fluid/operators/generator/get_expected_kernel_func.cc @@ -65,7 +65,7 @@ static bool ReduceOpHasOptimizedOneDNNKernel( bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // oneDNN is supporting only unchangable in size pool window - auto src_tz = phi::vectorize(ctx.Input("X")->dims()); + auto src_tz = common::vectorize(ctx.Input("X")->dims()); if (!ctx.HasAttr("ksize")) { return false; } @@ -228,7 +228,7 @@ phi::KernelKey GetSoftmaxExpectedKernelType( const framework::OperatorWithKernel* op_ptr) { // choose cudnn kernel if the runtime supported. std::string data_format = ctx.Attr("data_format"); - phi::DataLayout layout_ = phi::StringToDataLayout(data_format); + phi::DataLayout layout_ = common::StringToDataLayout(data_format); auto input_data_type = op_ptr->IndicateVarDataType(ctx, "X"); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ( @@ -248,7 +248,7 @@ phi::KernelKey GetSoftmaxGradExpectedKernelType( const framework::OperatorWithKernel* op_ptr) { // choose cudnn kernel if the runtime supported. std::string data_format = ctx.Attr("data_format"); - phi::DataLayout layout_ = phi::StringToDataLayout(data_format); + phi::DataLayout layout_ = common::StringToDataLayout(data_format); auto input_data_type = op_ptr->IndicateVarDataType(ctx, framework::GradVarName("Out")); if (input_data_type == framework::proto::VarType::FP16) { diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 9230e114bd3bb2..c88d36602bd79c 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -64,9 +64,9 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { ScopedTensorDescriptor input_desc; ScopedTensorDescriptor output_desc; cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - DataLayout::kNCHW, phi::vectorize(input->dims())); + DataLayout::kNCHW, common::vectorize(input->dims())); cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - DataLayout::kNCHW, phi::vectorize(output->dims())); + DataLayout::kNCHW, common::vectorize(output->dims())); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward( handle, @@ -123,13 +123,13 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { ScopedTensorDescriptor input_grad_desc; ScopedTensorDescriptor output_grad_desc; cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - DataLayout::kNCHW, phi::vectorize(input->dims())); + DataLayout::kNCHW, common::vectorize(input->dims())); cudnnTensorDescriptor_t cudnn_input_grad_desc = - input_grad_desc.descriptor(DataLayout::kNCHW, - phi::vectorize(input_grad->dims())); + input_grad_desc.descriptor( + DataLayout::kNCHW, common::vectorize(input_grad->dims())); cudnnTensorDescriptor_t cudnn_output_grad_desc = output_grad_desc.descriptor( - DataLayout::kNCHW, phi::vectorize(output_grad->dims())); + DataLayout::kNCHW, common::vectorize(output_grad->dims())); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSpatialTfSamplerBackward( handle, diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index e5fc57c6567b4f..03887561934b7b 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -52,7 +52,7 @@ class HashOp : public framework::OperatorWithKernel { int num_hash = ctx->Attrs().Get("num_hash"); HashOutputSize(dims, out_dims, num_hash); - 
ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + ctx->SetOutputDim("Out", common::make_ddim(out_dims)); ctx->ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h index 700f7c1d70138a..268bcc273272d3 100644 --- a/paddle/fluid/operators/hash_op.h +++ b/paddle/fluid/operators/hash_op.h @@ -51,7 +51,7 @@ class HashKernel : public framework::OpKernel { std::vector out_dims; HashOutputSize(in_dims, out_dims, num_hash); - out_t->Resize(phi::make_ddim(out_dims)); + out_t->Resize(common::make_ddim(out_dims)); auto* output = out_t->mutable_data(context.GetPlace()); auto seq_length = in_dims[0]; diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index b13d83a57ee974..c06885633f3482 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -79,8 +79,8 @@ void IndexSelectInner(const framework::ExecutionContext& context, VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums << "; slice_size: " << slice_size << "; index_size: " << index_size; - input->Resize(phi::make_ddim({outer_nums, input_dim[dim], slice_size})); - output->Resize(phi::make_ddim({outer_nums, index_size, slice_size})); + input->Resize(common::make_ddim({outer_nums, input_dim[dim], slice_size})); + output->Resize(common::make_ddim({outer_nums, index_size, slice_size})); auto input_tensor = phi::EigenTensor::From(*input); auto output_tensor = phi::EigenTensor::From(*output); diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 1af8b247de4479..a64bd3c8ac7f6e 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -36,7 +36,7 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { "Input(X) dimension is 3, but got method = %s .", interp_method)); const DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); + common::StringToDataLayout(ctx->Attrs().Get("data_layout")); if (ctx->HasInputs("SizeTensor")) { // top prority size @@ -125,7 +125,7 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { "Input(X) dimension is 4, but got method is %s.", interp_method)); const DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); + common::StringToDataLayout(ctx->Attrs().Get("data_layout")); if (ctx->HasInputs("SizeTensor")) { // top prority size @@ -220,7 +220,7 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { "dimension is 5, but got method = %s .", interp_method)); const DataLayout data_layout = - phi::StringToDataLayout(ctx->Attrs().Get("data_layout")); + common::StringToDataLayout(ctx->Attrs().Get("data_layout")); if (ctx->HasInputs("SizeTensor")) { // top prority size @@ -353,7 +353,7 @@ class InterpolateOp : public framework::OperatorWithKernel { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_layout"); - auto dl = phi::StringToDataLayout(data_format); + auto dl = common::StringToDataLayout(data_format); // Some models may have intentionally set "AnyLayout" for pool // op. 
Treat this as NCHW (default data_format value) if (dl != phi::DataLayout::kAnyLayout) { diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index a0e1410f52d3d3..bfbb15b076448a 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -916,7 +916,7 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, auto* input_data = input.data(); const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1008,7 +1008,7 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, auto* input_data = input.data(); const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1160,7 +1160,7 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, auto* input_data = input.data(); const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1291,7 +1291,7 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, const phi::DenseTensor output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1382,7 +1382,7 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, const phi::DenseTensor output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1528,7 +1528,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, const phi::DenseTensor& output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 6272017aa0da07..31767d68b9d3c9 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -36,7 +36,7 @@ inline std::vector get_new_shape( for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { auto tensor = list_new_shape_tensor[i]; PADDLE_ENFORCE_EQ(tensor->dims(), - phi::make_ddim({1}), + common::make_ddim({1}), 
platform::errors::InvalidArgument( "The shape of dimension tensor should be [1]," "but received d%.", @@ -856,7 +856,7 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, const phi::DenseTensor& input, phi::DenseTensor* output) { const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -930,7 +930,7 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, const phi::DenseTensor& input, phi::DenseTensor* output) { const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1047,7 +1047,7 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, const phi::DenseTensor& input, phi::DenseTensor* output) { const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1160,7 +1160,7 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, const phi::DenseTensor& output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1234,7 +1234,7 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, const phi::DenseTensor& output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1345,7 +1345,7 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, const phi::DenseTensor output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); diff --git a/paddle/fluid/operators/is_empty_op.h b/paddle/fluid/operators/is_empty_op.h index 6f020cba1944d6..3c9dfbf58fae52 100644 --- a/paddle/fluid/operators/is_empty_op.h +++ b/paddle/fluid/operators/is_empty_op.h @@ -32,7 +32,7 @@ class IsEmptyOpKernel : public framework::OpKernel { // always be allocated for CPUPlace. We reigister CUDA kernel for this op to // avoid the unnecessary data transform. 
output_tensor->mutable_data(platform::CPUPlace())[0] = - phi::product(input_tensor->dims()) == 0; + common::product(input_tensor->dims()) == 0; } }; diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 92f190c0025ed9..8f0b705c8de79f 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -27,7 +27,7 @@ class L1NormOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "L1NormOp"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "L1NormOp"); - ctx->SetOutputDim("Out", phi::make_ddim({})); + ctx->SetOutputDim("Out", common::make_ddim({})); } }; diff --git a/paddle/fluid/operators/layout_utils.h b/paddle/fluid/operators/layout_utils.h index 2faf47538ffa54..da49245812605e 100644 --- a/paddle/fluid/operators/layout_utils.h +++ b/paddle/fluid/operators/layout_utils.h @@ -35,31 +35,31 @@ inline void ResizeToChannelFirst(const framework::ExecutionContext& context, // input transformed_input->Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[4]; in_dims_vec[2] = input->dims()[1]; in_dims_vec[3] = input->dims()[2]; in_dims_vec[4] = input->dims()[3]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); transformed_input->mutable_data(context.GetPlace()); } else if (dim == 2) { // input transformed_input->Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[3]; in_dims_vec[2] = input->dims()[1]; in_dims_vec[3] = input->dims()[2]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); transformed_input->mutable_data(context.GetPlace()); } else if (dim == 1) { transformed_input->Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[2]; in_dims_vec[2] = input->dims()[1]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); transformed_input->mutable_data(context.GetPlace()); } } @@ -73,31 +73,31 @@ inline void ResizeToChannelLast(const framework::ExecutionContext& context, // input transformed_input->Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[2]; in_dims_vec[2] = input->dims()[3]; in_dims_vec[3] = input->dims()[4]; in_dims_vec[4] = input->dims()[1]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); transformed_input->mutable_data(context.GetPlace()); } else if (dim == 2) { // input transformed_input->Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[2]; in_dims_vec[2] = input->dims()[3]; in_dims_vec[3] = input->dims()[1]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); transformed_input->mutable_data(context.GetPlace()); } else if (dim == 1) { transformed_input->Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = 
input->dims()[2]; in_dims_vec[2] = input->dims()[1]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); transformed_input->mutable_data(context.GetPlace()); } } diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index 49387240625c18..9a27db56a9c31e 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -129,7 +129,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { // Now, all the inputs and outputs should be on the CPU memory. phi::DenseTensor emission_row_max; emission_row_max.mutable_data( - phi::make_ddim({static_cast(batch_size), 1}), + common::make_ddim({static_cast(batch_size), 1}), platform::CPUPlace()); auto& place = *ctx.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h index b085f332781b0b..3d574b1f844c87 100644 --- a/paddle/fluid/operators/lite/ut_helper.h +++ b/paddle/fluid/operators/lite/ut_helper.h @@ -99,7 +99,7 @@ void CreateTensor(framework::Scope* scope, const std::vector& shape) { auto* var = scope->Var(name); auto* tensor = var->GetMutable(); - auto dims = phi::make_ddim(shape); + auto dims = common::make_ddim(shape); tensor->Resize(dims); platform::Place place = platform::CPUPlace(); RandomizeTensor(tensor, place); diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index a4af52472ca9cd..a468577ab9aa1f 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -96,7 +96,7 @@ class LoDResetKernel : public framework::OpKernel { "The last value of 'Target LoD''s last level LoD should be equal " "to the first dimension of Input(X). But received the 'Target LoD' " "is %s, Input(X)'s shape is %s.", - phi::make_ddim(level0), + common::make_ddim(level0), in->dims())); for (size_t i = 0; i < level0.size() - 1; ++i) { PADDLE_ENFORCE_GE(level0[i + 1], @@ -104,7 +104,7 @@ class LoDResetKernel : public framework::OpKernel { platform::errors::InvalidArgument( "'Target LoD' should be an ascending " "vector. 
But received the Target LoD is %s.", - phi::make_ddim(level0))); + common::make_ddim(level0))); } // cast level0 to size_t diff --git a/paddle/fluid/operators/lookup_table_dequant_op.cc b/paddle/fluid/operators/lookup_table_dequant_op.cc index f9258f9e0185cd..93826aab0d5739 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.cc +++ b/paddle/fluid/operators/lookup_table_dequant_op.cc @@ -66,7 +66,7 @@ class LookupTableDequantOp : public framework::OperatorWithKernel { ids_dims)); auto output_dims = - phi::vectorize(phi::slice_ddim(ids_dims, 0, ids_rank - 1)); + common::vectorize(common::slice_ddim(ids_dims, 0, ids_rank - 1)); PADDLE_ENFORCE_GE(table_dims[1], 2, platform::errors::InvalidArgument( @@ -76,7 +76,7 @@ class LookupTableDequantOp : public framework::OperatorWithKernel { table_dims)); output_dims.push_back((table_dims[1] - 2) * 4); - ctx->SetOutputDim("Out", phi::make_ddim(output_dims)); + ctx->SetOutputDim("Out", common::make_ddim(output_dims)); if (ctx->GetOutputsVarType("Out")[0] == framework::proto::VarType::LOD_TENSOR) { diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 6bb9f9ee19e42c..a8185691c45aae 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -56,9 +56,9 @@ class LookupTableOp : public framework::OperatorWithKernel { ids_dims)); auto output_dims = - phi::vectorize(phi::slice_ddim(ids_dims, 0, ids_rank - 1)); + common::vectorize(common::slice_ddim(ids_dims, 0, ids_rank - 1)); output_dims.push_back(table_dims[1]); - ctx->SetOutputDim("Out", phi::make_ddim(output_dims)); + ctx->SetOutputDim("Out", common::make_ddim(output_dims)); if (ctx->GetOutputsVarType("Out")[0] == framework::proto::VarType::LOD_TENSOR) { diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 32946d65785a97..ba8af995429a39 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -192,7 +192,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto *d_output_data = d_output->data(); auto d_output_dims = d_output->dims(); auto d_output_dims_2d = - phi::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); + common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index b467428eeafd3e..21f0bf6a957aea 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -207,7 +207,7 @@ class LookupTableGradKernel : public framework::OpKernel { auto d_output_dims = d_output->dims(); auto d_output_dims_2d = - phi::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); + common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 11c35293ebe345..edd8b20da160c5 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -187,7 +187,7 @@ struct LookupTableV2GradCUDAFunctor { auto *d_output_data = d_output->template data(); auto d_output_dims = d_output->dims(); auto d_output_dims_2d = - phi::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); + common::flatten_to_2d(d_output_dims, 
d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 52c93f26b7e8a8..82dbac8b21dfc2 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -206,7 +206,7 @@ struct LookupTableV2GradCPUFunctor { auto d_output_dims = d_output->dims(); auto d_output_dims_2d = - phi::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); + common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index a1e328fce5942c..bf4c72a2133b69 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -54,9 +54,9 @@ struct LRNFunctor { auto in_dims = input.dims(); std::vector shape( {in_dims[0], in_dims[3], in_dims[1], in_dims[2]}); - in_transpose.mutable_data(phi::make_ddim(shape), place); - mid_transpose.mutable_data(phi::make_ddim(shape), place); - out_transpose.mutable_data(phi::make_ddim(shape), place); + in_transpose.mutable_data(common::make_ddim(shape), place); + mid_transpose.mutable_data(common::make_ddim(shape), place); + out_transpose.mutable_data(common::make_ddim(shape), place); std::vector axis = {0, 3, 1, 2}; transpose(dev_ctx, input, &in_transpose, axis); } else { @@ -238,7 +238,7 @@ class LRNOp : public framework::OperatorWithKernel { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); - auto dl = phi::StringToDataLayout(data_format); + auto dl = common::StringToDataLayout(data_format); // Some models may have intentionally set "AnyLayout" for lrn // op. Treat this as NCHW (default data_format value) if (dl != phi::DataLayout::kAnyLayout) { @@ -361,7 +361,7 @@ class LRNOpGrad : public framework::OperatorWithKernel { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); - auto dl = phi::StringToDataLayout(data_format); + auto dl = common::StringToDataLayout(data_format); // Some models may have intentionally set "AnyLayout" for lrn // op. Treat this as NCHW (default data_format value) if (dl != phi::DataLayout::kAnyLayout) { diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index 15ebb4df74f47d..4d1cc268d48b6f 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -56,7 +56,7 @@ class LRNKernel : public framework::OpKernel { const std::string data_layout_str = ctx.Attr("data_format"); const phi::DataLayout data_layout = - phi::StringToDataLayout(data_layout_str); + common::StringToDataLayout(data_layout_str); // NCHW int N = x_dims[0]; int C = (data_layout != DataLayout::kNHWC ? 
x_dims[1] : x_dims[3]); @@ -147,7 +147,7 @@ class LRNGradKernel : public framework::OpKernel { const phi::DenseTensor& mid = *ctx.Input("MidOut"); const std::string data_layout_str = ctx.Attr("data_format"); const phi::DataLayout data_layout = - phi::StringToDataLayout(data_layout_str); + common::StringToDataLayout(data_layout_str); auto x_g = ctx.Output(framework::GradVarName("X")); x_g->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index 271a027c456236..7055f3ca95efe0 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -182,8 +182,8 @@ void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { out_dims_vec.push_back(1); std::vector tmp_dims_vec{tmp_dim_0}; tmp_dims_vec.push_back(1); - ctx->SetOutputDim("Out", phi::make_ddim(out_dims_vec)); - ctx->SetOutputDim("Tmp", phi::make_ddim(tmp_dims_vec)); + ctx->SetOutputDim("Out", common::make_ddim(out_dims_vec)); + ctx->SetOutputDim("Tmp", common::make_ddim(tmp_dims_vec)); } void MatchMatrixTensorOpGrad::InferShape( diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index af14333b9d1ea0..0e0423bd64ff45 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -6,20 +6,20 @@ if(WITH_XPU) endif() # please add new math_library in alphabetical order -math_library(concat_and_split DEPS phi) -math_library(context_project DEPS phi) +math_library(concat_and_split DEPS phi common) +math_library(context_project DEPS phi common) math_library(cos_sim_functor) math_library(depthwise_conv) math_library(sample_prob) -math_library(sampler DEPS phi) +math_library(sampler DEPS phi common) if(WITH_XPU) - math_library(beam_search DEPS phi beam_search_xpu) + math_library(beam_search DEPS phi common beam_search_xpu) else() - math_library(beam_search DEPS phi) + math_library(beam_search DEPS phi common) endif() math_library(unpooling) math_library(prelu) math_library(bert_encoder_functor) -math_library(tree2col DEPS phi) +math_library(tree2col DEPS phi common) diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index be8734076da3b4..aeff6c394c429a 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -67,7 +67,7 @@ class BeamSearchFunctor { 0, [](size_t a, std::vector &b) { return a + b.size(); }); // the output tensor shape should be [num_instances, 1] - auto dims = phi::make_ddim( + auto dims = common::make_ddim( std::vector({static_cast(num_instances), 1})); auto *selected_ids_data = selected_ids->mutable_data(dims, platform::CPUPlace()); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index bd8e905389e812..098f40ab526b10 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -432,7 +432,7 @@ class BeamSearchFunctor { // Reserve a big enough memory. 
auto selected_dims = - phi::make_ddim({static_cast(num_seqs * beam_size), 1}); + common::make_ddim({static_cast(num_seqs * beam_size), 1}); int64_t* selected_ids_data = selected_ids->mutable_data(selected_dims, context.GetPlace()); float* selected_scores_data = @@ -521,7 +521,7 @@ class BeamSearchFunctor { selected_scores->set_lod(selected_lod); if (selected_lod[1].back() < num_seqs * beam_size) { auto final_selected_dims = - phi::make_ddim({static_cast(selected_lod[1].back()), 1}); + common::make_ddim({static_cast(selected_lod[1].back()), 1}); selected_ids->Resize(final_selected_dims); selected_scores->Resize(final_selected_dims); if (parent_idx) { diff --git a/paddle/fluid/operators/math/beam_search_xpu.cc b/paddle/fluid/operators/math/beam_search_xpu.cc index 5451b5a3138896..4ac0e3d886017a 100644 --- a/paddle/fluid/operators/math/beam_search_xpu.cc +++ b/paddle/fluid/operators/math/beam_search_xpu.cc @@ -92,7 +92,7 @@ class BeamSearchFunctor { 0, [](size_t a, std::vector &b) { return a + b.size(); }); // the output tensor shape should be [num_instances, 1] - auto dims = phi::make_ddim( + auto dims = common::make_ddim( std::vector({static_cast(num_instances), 1})); auto *selected_ids_data = selected_ids->mutable_data(dims, platform::CPUPlace()); diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index 0b6dc510f477fa..20211160b7e5ed 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -130,13 +130,13 @@ class ContextProjectFunctor { context_length, sequence_width}); // output_height, output_width, // input_channels, filter_height, filter_width - out_t.Resize(phi::make_ddim(output_shape)); + out_t.Resize(common::make_ddim(output_shape)); std::vector input_shape( {1, input_row_end - input_row_begin, sequence_width}); // input_channels, input_height, input_width - in_t.Resize(phi::make_ddim(input_shape)); + in_t.Resize(common::make_ddim(input_shape)); im2col_ocf(context, in_t, dilation, stride, padding, &out_t); out_t.Resize({sequence_height, context_length * sequence_width}); } @@ -265,13 +265,13 @@ class ContextProjectGradFunctor { context_length, sequence_width}); // output_height, output_width, // input_channels, filter_height, filter_width - out_t.Resize(phi::make_ddim(output_shape)); + out_t.Resize(common::make_ddim(output_shape)); std::vector input_shape( {1, input_row_end - input_row_begin, sequence_width}); // input_channels, input_height, input_width - in_t.Resize(phi::make_ddim(input_shape)); + in_t.Resize(common::make_ddim(input_shape)); col2im_ocf(context, out_t, dilation, stride, padding, &in_t); out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index f4198acfd830c7..8d6b0b99f9d528 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -132,13 +132,13 @@ struct MatrixEighFunctor { framework::TransToProtoVarType(input.dtype()))) { lrwork = std::max(1, static_cast(rwork_opt)); rwork_data = rwork_tensor.mutable_data( - phi::make_ddim({lrwork}), ctx.GetPlace()); + common::make_ddim({lrwork}), ctx.GetPlace()); } phi::DenseTensor iwork_tensor, work_tensor; - auto *iwork_data = iwork_tensor.mutable_data(phi::make_ddim({liwork}), - ctx.GetPlace()); + auto *iwork_data = iwork_tensor.mutable_data( + common::make_ddim({liwork}), ctx.GetPlace()); auto *work_data = - 
work_tensor.mutable_data(phi::make_ddim({lwork}), ctx.GetPlace()); + work_tensor.mutable_data(common::make_ddim({lwork}), ctx.GetPlace()); for (auto i = 0; i < batch_size; i++) { auto *value_data = out_value + i * values_stride; diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 0c6b49729546cd..bf028c4ada3695 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -19,12 +19,12 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/sample_prob.h" #include "paddle/fluid/operators/math/sampler.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 7c60be68415520..524ba826a57047 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -17,11 +17,11 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/sampler.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index df66ab400f40bf..895a427bae6e20 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -40,7 +40,7 @@ static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) { if (x_dim.size() > 1) { return x_dim; } - return phi::make_ddim({1, x_dim[0]}); + return common::make_ddim({1, x_dim[0]}); } /** @@ -51,7 +51,7 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) { if (y_dim.size() > 1) { return y_dim; } - return phi::make_ddim({y_dim[0], 1}); + return common::make_ddim({y_dim[0], 1}); } #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 @@ -676,11 +676,11 @@ class MatMulOp : public framework::OperatorWithKernel { std::vector dim_out; if (mat_dim_x.batch_size_ != 0) { - dim_out = phi::vectorize(dim_x); + dim_out = common::vectorize(dim_x); dim_out[dim_out.size() - 2] = mat_dim_x.height_; dim_out[dim_out.size() - 1] = dim_out_y; } else if (mat_dim_y.batch_size_ != 0) { - dim_out = phi::vectorize(dim_y); + dim_out = common::vectorize(dim_y); dim_out[dim_out.size() - 2] = mat_dim_x.height_; dim_out[dim_out.size() - 1] = dim_out_y; } else { @@ -696,7 +696,7 @@ class MatMulOp : public framework::OperatorWithKernel { dim_out.resize(dim_out.size() - 1); } - phi::DDim ddim_out = phi::make_ddim(dim_out); + phi::DDim ddim_out = common::make_ddim(dim_out); context->SetOutputDim("Out", ddim_out); context->ShareLoD("X", "Out"); diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 3f0fd7bfef2dcc..3ed27460e16b6c 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -86,16 +86,16 @@ class MergeLoDTensorOp : public framework::OperatorBase { framework::DDim in_dims; if (in_true.IsInitialized()) { rank = in_true.dims().size(); - in_dims = phi::slice_ddim(in_true.dims(), 1, rank); + in_dims = common::slice_ddim(in_true.dims(), 1, rank); 
} else { rank = in_false.dims().size(); - in_dims = phi::slice_ddim(in_false.dims(), 1, rank); + in_dims = common::slice_ddim(in_false.dims(), 1, rank); } - auto in_dim_vec = phi::vectorize(in_dims); + auto in_dim_vec = common::vectorize(in_dims); in_dim_vec.insert(in_dim_vec.begin(), batch_size); - framework::DDim out_dims = phi::make_ddim(in_dim_vec); + framework::DDim out_dims = common::make_ddim(in_dim_vec); out->Resize(out_dims); out->mutable_data(place, data_type); diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc index 413cd8546011be..63385cb59171fa 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.cc +++ b/paddle/fluid/operators/metrics/precision_recall_op.cc @@ -106,7 +106,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ( weights_dims, - phi::make_ddim({max_probs_dims[0], 1}), + common::make_ddim({max_probs_dims[0], 1}), platform::errors::InvalidArgument( "The shape of PrecisionRecallOp Input(Weights) should be " "[batch_size, 1]. But the shape we received is [%d, %d]", @@ -120,7 +120,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ( states_dims, - phi::make_ddim({cls_num, 4}), + common::make_ddim({cls_num, 4}), platform::errors::InvalidArgument( "The shape of PrecisionRecallOp Input(StatesInfo) should be " "[class_number, 4]. But the shape we received is [%d, %d]", diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 8c33a5da1baff9..64bc176d971492 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -48,7 +48,7 @@ class MinusOp : public framework::OperatorWithKernel { auto y_dims = ctx->GetInputDim("Y"); if (ctx->IsRuntime() || - (phi::product(x_dims) > 0 && phi::product(y_dims) > 0)) { + (common::product(x_dims) > 0 && common::product(y_dims) > 0)) { PADDLE_ENFORCE_EQ( x_dims, y_dims, diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index c0a68fe126c27b..34e9679b29bb64 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -33,7 +33,7 @@ class InterpolateOneDNNHandler phi::DenseTensor* out) : phi::funcs::OneDNNHandlerNoCachingT( engine, cpu_place) { - const auto dst_tz = phi::vectorize(out->dims()); + const auto dst_tz = common::vectorize(out->dims()); const auto dst_md = memory::desc( dst_tz, phi::funcs::OneDNNGetDataType(), OneDNNMemoryFormat::any); this->AcquireForwardPrimitiveDescriptor( @@ -49,7 +49,7 @@ class InterpolateOneDNNKernel : public framework::OpKernel { const auto& in_dims = x->dims(); const framework::DDim in_dhw_dims = - phi::slice_ddim(in_dims, 2, in_dims.size()); + common::slice_ddim(in_dims, 2, in_dims.size()); std::vector out_dims; out_dims.reserve(5); @@ -102,7 +102,7 @@ class InterpolateOneDNNKernel : public framework::OpKernel { if (scale.size() == 3 && scale[0] > 0.0f && scale[1] > 0.0f && scale[2] > 0.0f) { int j = 0; - std::vector in_dhw_vec = phi::vectorize(in_dhw_dims); + std::vector in_dhw_vec = common::vectorize(in_dhw_dims); std::transform( in_dhw_vec.begin(), in_dhw_vec.end(), @@ -138,7 +138,7 @@ class InterpolateOneDNNKernel : public framework::OpKernel { : dnnl::algorithm::resampling_linear; const auto out_dims_vec = ComputeOutputShape(ctx); - framework::DDim dim_out = phi::make_ddim(out_dims_vec); + 
framework::DDim dim_out = common::make_ddim(out_dims_vec); out->Resize(dim_out); InterpolateOneDNNHandler handler( diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 497a7186b537de..d2b715a5f56e6a 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -87,7 +87,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - auto src_tz = phi::vectorize(x->dims()); + auto src_tz = common::vectorize(x->dims()); PADDLE_ENFORCE_EQ(begin_norm_axis, (src_tz.size() - 1), platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index a1dad17392a22e..80af1b00b743cc 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -18,11 +18,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/blas/blas.h" namespace { +using common::vectorize; using dnnl::memory; using paddle::framework::ExecutionContext; using paddle::framework::GradVarName; using phi::OneDNNContext; -using phi::vectorize; using phi::funcs::OneDNNGetDataType; // Reshape a rank-3 tensor from P x M x N to (P * M) x N. @@ -467,7 +467,7 @@ class MatMulMKLDNNKernel : public paddle::framework::OpKernel { (*y_bd_dims)[i])); (out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); } - out->Resize(phi::make_ddim((out_dims))); + out->Resize(common::make_ddim((out_dims))); } } }; diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 48e8e12ccab142..63b373be5ad156 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -49,7 +49,7 @@ class QuantOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); - auto x_tz = phi::vectorize(x->dims()); + auto x_tz = common::vectorize(x->dims()); const bool is_negative_input = ctx.Attr("is_negative_input"); const bool bfloat16 = ctx.Attr("bfloat16"); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index e41baa39b47c58..99e25f98ce3598 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -62,7 +62,7 @@ class ReQuantOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); - auto src_tz = phi::vectorize(input->dims()); + auto src_tz = common::vectorize(input->dims()); auto src_paddle_dt = input->dtype(); auto dst_paddle_dt = with_shift ? DataType::UINT8 : src_paddle_dt; diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index d1bbfe42293724..5e5e2f8c19abe1 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -38,7 +38,7 @@ static std::vector extract_shape( for (const auto& tensor : list_new_shape_tensor) { PADDLE_ENFORCE_EQ( tensor->dims(), - phi::make_ddim({1}), + common::make_ddim({1}), platform::errors::InvalidArgument( "If the element type of 'shape' in ReshapeOp is phi::DenseTensor, " "the element's shape must be [1]. 
But received the element's shape " @@ -68,7 +68,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { framework::DDim x_dims, out_dims; InferInOutShape(ctx, x_dims, out_dims); - auto x_vec_dims = phi::vectorize(x_dims); + auto x_vec_dims = common::vectorize(x_dims); auto x_type = phi::funcs ::ToOneDNNDataType(x->dtype()); phi::funcs::ReorderOneDNNHandler reorder_handler( @@ -89,7 +89,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { astream.wait(); out->Resize(out_dims); - auto reshape_dims = out_dims.size() != 0 ? phi::vectorize(out_dims) + auto reshape_dims = out_dims.size() != 0 ? common::vectorize(out_dims) : std::vector{1}; out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(reshape_dims)); } @@ -160,15 +160,15 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { auto x = ctx.Input("X"); x_dims = x->dims(); auto axes = ctx.Attr("axis"); - out_dims = phi::make_ddim( + out_dims = common::make_ddim( Flatten2Kernel::GetOutputShape(axes, x_dims)); } protected: static framework::DDim ValidateShape(const std::vector& shape, const framework::DDim& in_dims) { - const int64_t in_size = phi::product(in_dims); - auto in_dims_vec = phi::vectorize(in_dims); + const int64_t in_size = common::product(in_dims); + auto in_dims_vec = common::vectorize(in_dims); bool all_positive = std::all_of(in_dims_vec.cbegin(), in_dims_vec.cend(), [](int64_t i) { return i > 0; }); @@ -188,7 +188,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { platform::errors::InvalidArgument( "Only one dimension value of 'shape' in ReshapeOp can " "be -1. But received shape = [%s], shape[%d] is also -1.", - phi::make_ddim(shape), + common::make_ddim(shape), i)); unk_dim_idx = i; } else if (shape[i] == copy_dim_val) { @@ -200,7 +200,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { "the input tensor X's dimensions. " "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " "X's dimensions = %d.", - phi::make_ddim(shape), + common::make_ddim(shape), i, in_dims, in_dims.size())); @@ -212,7 +212,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { "Each dimension value of 'shape' in ReshapeOp must not " "be negative except one unknown dimension. " "But received shape = [%s], shape[%d] = %d.", - phi::make_ddim(shape), + common::make_ddim(shape), i, shape[i])); } @@ -240,7 +240,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { "'shape' is [%s], known capacity of 'shape' is %d.", in_dims, in_size, - phi::make_ddim(shape), + common::make_ddim(shape), capacity)); } else { output_shape[unk_dim_idx] = -1; @@ -258,11 +258,11 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { "[%s], the capacity of 'shape' is %d.", in_dims, in_size, - phi::make_ddim(shape), + common::make_ddim(shape), capacity)); } } - return phi::make_ddim(output_shape); + return common::make_ddim(output_shape); } }; @@ -284,8 +284,9 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { framework::DDim dx_dims; InferOutputShapeInGrad(ctx, dx_dims); - auto dout_vec_dims = dout->dims().size() != 0 ? phi::vectorize(dout->dims()) - : std::vector{1}; + auto dout_vec_dims = dout->dims().size() != 0 + ? common::vectorize(dout->dims()) + : std::vector{1}; auto dout_type = phi::funcs::ToOneDNNDataType(dout->dtype()); phi::funcs::ReorderOneDNNHandler reorder_handler( @@ -305,8 +306,8 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { astream.wait(); dx->Resize(dx_dims); - const auto reshape_dims = - dx_dims.size() != 0 ? 
phi::vectorize(dx_dims) : std::vector{1}; + const auto reshape_dims = dx_dims.size() != 0 ? common::vectorize(dx_dims) + : std::vector{1}; reorder_dst_memory_p->get_desc().reshape(reshape_dims); } @@ -345,7 +346,7 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { const framework::ExecutionContext& ctx, framework::DDim& dx_dims) const { // NOLINT auto xshape_dims = ctx.Input("XShape")->dims(); - dx_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); + dx_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); } void InferShapeFlattenGradOp(const framework::ExecutionContext& ctx, diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index ee6712df4734bd..f3ab0e8459cc48 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -44,7 +44,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { return; } - auto x_vec_dims = phi::vectorize(x->dims()); + auto x_vec_dims = common::vectorize(x->dims()); auto x_type = phi::funcs::ToOneDNNDataType(x->dtype()); phi::funcs::ReorderOneDNNHandler reorder_handler( @@ -105,7 +105,7 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { return; } - auto dout_vec_dims = phi::vectorize(dout->dims()); + auto dout_vec_dims = common::vectorize(dout->dims()); auto dout_type = phi::funcs::ToOneDNNDataType(dout->dtype()); phi::funcs::ReorderOneDNNHandler reorder_handler( diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index b44c795b6e5368..c6d553865277ed 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -38,7 +38,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel { x_dims.size())); if (ctx->IsRuntime() || - (phi::product(x_dims) > 0 && phi::product(y_dims) > 0)) { + (common::product(x_dims) > 0 && common::product(y_dims) > 0)) { PADDLE_ENFORCE_EQ( x_dims, y_dims, diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu index f811b0ad9d6d64..d063e8d1cb4d57 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cu +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -49,7 +49,7 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); if (out0) { - auto counts = phi::product(in1->dims()); + auto counts = common::product(in1->dims()); auto y_ptr = thrust::device_pointer_cast(in0->data()); auto inter_val_ptr = thrust::device_pointer_cast(in1->data()); auto out_grad_ptr = thrust::device_pointer_cast(in2->data()); diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h index 571482ce475886..4330abde2a828a 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.h +++ b/paddle/fluid/operators/modified_huber_loss_op.h @@ -92,7 +92,7 @@ class ModifiedHuberLossGradCPUKernel : public framework::OpKernel { const T* y_ptr = in0->data(); const T* inter_val_ptr = in1->data(); const T* out_grad_ptr = in2->data(); - size_t counts = static_cast(phi::product(in1->dims())); + size_t counts = static_cast(common::product(in1->dims())); T* x_grad_ptr = out0->mutable_data(context.GetPlace()); for (size_t i = 0; i < counts; ++i) { if (inter_val_ptr[i] < -1) { diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc 
b/paddle/fluid/operators/nccl/nccl_op.cu.cc index 7dae16afafdf11..abb24cc8cae10d 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -110,7 +110,7 @@ class NCCLReduceKernel : public framework::OpKernel { if (root == gpu_id) { recvbuffer = out->mutable_data(ctx.GetPlace()); } else { - out->Resize(phi::make_ddim({0})); + out->Resize(common::make_ddim({0})); } VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() << " recv " << out->numel(); @@ -155,7 +155,7 @@ class NCCLBcastKernel : public framework::OpKernel { } else { auto* out = ctx.Output("Out"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " - << phi::product(out->dims()); + << common::product(out->dims()); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclBcast(out->mutable_data(ctx.GetPlace()), out->numel(), diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 477d5aea7e8839..f4320cd0b6796e 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -100,7 +100,7 @@ class NCEOp : public framework::OperatorWithKernel { std::vector out_dims; out_dims.push_back(x_dims[0]); out_dims.push_back(1); - ctx->SetOutputDim("Cost", phi::make_ddim(out_dims)); + ctx->SetOutputDim("Cost", common::make_ddim(out_dims)); if (!is_test) { // set dims of output(SampleOut) @@ -108,8 +108,8 @@ class NCEOp : public framework::OperatorWithKernel { sample_out_dims.push_back(x_dims[0]); sample_out_dims.push_back( (num_true_classes == -1) ? -1 : (num_neg_samples + num_true_classes)); - ctx->SetOutputDim("SampleLogits", phi::make_ddim(sample_out_dims)); - ctx->SetOutputDim("SampleLabels", phi::make_ddim(sample_out_dims)); + ctx->SetOutputDim("SampleLogits", common::make_ddim(sample_out_dims)); + ctx->SetOutputDim("SampleLabels", common::make_ddim(sample_out_dims)); } } diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index f8983c0db9a1fa..a21c7c816e191d 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -165,10 +165,10 @@ class NCEKernel : public framework::OpKernel { (num_true_classes == -1) ? -1 : (num_neg_samples + num_true_classes)); sample_labels = &sample_labels_tmp; - sample_labels->Resize(phi::make_ddim(sample_out_dims)); + sample_labels->Resize(common::make_ddim(sample_out_dims)); sample_out = &sample_out_tmp; - sample_out->Resize(phi::make_ddim(sample_out_dims)); + sample_out->Resize(common::make_ddim(sample_out_dims)); } else { sample_labels = context.Output("SampleLabels"); sample_out = context.Output("SampleLogits"); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index c1f9650c62376a..6c64c6a1f72ffb 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -55,14 +55,14 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { ctx->HasOutput("MomentOut"), "Output", "MomentOut", "DecayedAdagradOp"); auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(phi::product(lr_dims), + PADDLE_ENFORCE_NE(common::product(lr_dims), 0, platform::errors::InvalidArgument( "Maybe the Input variable LearningRate has not " "been initialized. 
You may need to confirm " "if you put exe.run(startup_program) " "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), + PADDLE_ENFORCE_EQ(common::product(lr_dims), 1, platform::errors::InvalidArgument( "LearningRate should have one element")); diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index 0da5ae57b0932f..d8762b8bd719a7 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -54,12 +54,12 @@ class DpsgdOp : public framework::OperatorWithKernel { "Output(ParamOut) of DpsgdOp should not be null.")); auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), + PADDLE_ENFORCE_EQ(common::product(lr_dims), 1, platform::errors::InvalidArgument( "Learning rate should have 1 dimension. But Received " "LearningRate's dims [%s].", - phi::product(lr_dims))); + common::product(lr_dims))); auto param_dims = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( param_dims, diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc index 7c757d3fddc25b..e6eadadc17b6cd 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cc +++ b/paddle/fluid/operators/optimizers/ftrl_op.cc @@ -52,18 +52,18 @@ class FTRLOp : public framework::OperatorWithKernel { ctx->GetInputDim("Grad"))); auto lr_dim = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(phi::product(lr_dim), + PADDLE_ENFORCE_NE(common::product(lr_dim), 0, platform::errors::InvalidArgument( "Maybe the Input variable LearningRate has not " "been initialized. You may need to confirm " "if you put exe.run(startup_program) " "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(phi::product(lr_dim), + PADDLE_ENFORCE_EQ(common::product(lr_dim), 1, platform::errors::InvalidArgument( "Learning Rate should be a scalar, but got %d", - phi::product(lr_dim))); + common::product(lr_dim))); ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("SquaredAccumOut", param_dim); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc index 625db9f375ab02..0c5a9721e279ba 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -24,7 +24,7 @@ class Pow2DecayWithLinearWarmupOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - auto dim = phi::make_ddim({1}); + auto dim = common::make_ddim({1}); ctx->SetOutputDim("LearningRateOut", dim); ctx->SetOutputDim("StepOut", dim); } diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index 08cc29ce9eb8db..5190d65ee0af26 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -44,7 +44,7 @@ class ProximalGDOp : public framework::OperatorWithKernel { auto lr_dim = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_EQ( - phi::product(lr_dim), + common::product(lr_dim), 1, platform::errors::InvalidArgument( "Learning Rate should be a scalar. 
But received dimmensions:[%s]", diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index d29b4b8fb2e5a4..4c47fd2b621784 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -151,7 +151,7 @@ class SparseMomentumOp : public framework::OperatorWithKernel { "VelocityOut", "SparseMomentum"); - auto lr_dims = phi::product(ctx->GetInputDim("LearningRate")); + auto lr_dims = common::product(ctx->GetInputDim("LearningRate")); PADDLE_ENFORCE_EQ(lr_dims != 0 && lr_dims == 1, true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 29d2807b239709..6529bbc29fcfe1 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -467,7 +467,7 @@ class Pad2dOp : public framework::OperatorWithKernel { } } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + ctx->SetOutputDim("Out", common::make_ddim(out_dims)); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -499,7 +499,7 @@ class Pad2dOp : public framework::OperatorWithKernel { auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); return phi::KernelKey(tensor.place(), - phi::StringToDataLayout(data_format), + common::StringToDataLayout(data_format), expected_kernel_type.dtype()); } #endif diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index 0bba0381d20933..3cbb0e1c6e2083 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -86,7 +86,7 @@ class PartialSumOp : public framework::OperatorWithKernel { std::vector out_dims(2); out_dims[0] = batch_size; out_dims[1] = (length == -1) ? 
input_len - start_index : length; - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + ctx->SetOutputDim("Out", common::make_ddim(out_dims)); ctx->ShareLoD("X", /*->*/ "Out"); } diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index 72236c012c357c..96d8bbaa6f772f 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -40,7 +40,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { "NeutralPair", "positive_negative_pair"); - auto scalar_dim = phi::make_ddim({1}); + auto scalar_dim = common::make_ddim({1}); if (ctx->HasInput("AccumulatePositivePair") || ctx->HasInput("AccumulateNegativePair") || ctx->HasInput("AccumulateNeutralPair")) { diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt index dea89806bc202c..25314c72b8033f 100755 --- a/paddle/fluid/operators/pscore/CMakeLists.txt +++ b/paddle/fluid/operators/pscore/CMakeLists.txt @@ -21,6 +21,7 @@ if(WITH_ARM_BRPC) sendrecv_rpc arm_brpc phi + common glog snappy device_context) @@ -39,6 +40,7 @@ else() sendrecv_rpc ${EXTERNAL_BRPC_DEPS} phi + common zlib device_context) endif() diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc index 45aef43caeeb48..aafc8ab7faad1f 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc @@ -64,12 +64,12 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel { for (auto &ids_dim : ids_dims) { if (lookup_table_version == "lookup_table") { - outputs_dims.push_back(phi::make_ddim({ids_dim[0], table_dims[1]})); + outputs_dims.push_back(common::make_ddim({ids_dim[0], table_dims[1]})); } else if (lookup_table_version == "lookup_table_v2") { outputs_dims.push_back( - phi::make_ddim({static_cast(ids_dim[0]), - static_cast(ids_dim[1]), - static_cast(table_dims[1])})); + common::make_ddim({static_cast(ids_dim[0]), + static_cast(ids_dim[1]), + static_cast(table_dims[1])})); } } diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h index 9b99089c141192..414500c2faac3a 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h @@ -119,10 +119,10 @@ class DistributedLookupTableKernel : public framework::OpKernel { auto *id_tensor = id_vars[i]->GetMutable(); auto *out_tensor = out_vars[i]->GetMutable(); - auto id_dims = phi::vectorize(id_tensor->dims()); - out_tensor->Resize(phi::make_ddim({static_cast(id_dims[0]), - static_cast(id_dims[1]), - static_cast(emb_dim)})); + auto id_dims = common::vectorize(id_tensor->dims()); + out_tensor->Resize(common::make_ddim({static_cast(id_dims[0]), + static_cast(id_dims[1]), + static_cast(emb_dim)})); } } } diff --git a/paddle/fluid/operators/pscore/fake_init_op.cc b/paddle/fluid/operators/pscore/fake_init_op.cc index cefd0ee5855f2b..cd919cb7ca0bf0 100644 --- a/paddle/fluid/operators/pscore/fake_init_op.cc +++ b/paddle/fluid/operators/pscore/fake_init_op.cc @@ -21,7 +21,7 @@ class FakeInitInferShape : public framework::InferShapeBase { void operator()(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FakeInit"); auto &shape = ctx->Attrs().Get>("shape"); - 
ctx->SetOutputDim("Out", phi::make_ddim(shape)); + ctx->SetOutputDim("Out", common::make_ddim(shape)); } }; @@ -38,10 +38,10 @@ class FakeInitOp : public framework::OperatorBase { if (out_var.IsType()) { tensor = out_var.GetMutable(); - tensor->Resize(phi::make_ddim(Attr>("shape"))); + tensor->Resize(common::make_ddim(Attr>("shape"))); } else if (out_var.IsType()) { tensor = out_var.GetMutable()->mutable_value(); - tensor->Resize(phi::make_ddim(Attr>("shape"))); + tensor->Resize(common::make_ddim(Attr>("shape"))); } else { PADDLE_THROW(platform::errors::InvalidArgument( "fake init op's output only" diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cc b/paddle/fluid/operators/pull_box_extended_sparse_op.cc index f0799f75862bc4..75918b9ad62a48 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cc +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cc @@ -54,14 +54,15 @@ class PullBoxExtendedSparseOp : public framework::OperatorWithKernel { "Shape error in %lu id, the last dimension of the " "'Ids' tensor must be 1.", i)); - auto out_dim = phi::vectorize(phi::slice_ddim(ids_dims, 0, ids_rank - 1)); + auto out_dim = + common::vectorize(common::slice_ddim(ids_dims, 0, ids_rank - 1)); out_dim.push_back(emb_size); - outs_dims[i] = phi::make_ddim(out_dim); + outs_dims[i] = common::make_ddim(out_dim); auto out_extended_dim = - phi::vectorize(phi::slice_ddim(ids_dims, 0, ids_rank - 1)); + common::vectorize(common::slice_ddim(ids_dims, 0, ids_rank - 1)); out_extended_dim.push_back(emb_extended_size); - outs_extended_dims[i] = phi::make_ddim(out_extended_dim); + outs_extended_dims[i] = common::make_ddim(out_extended_dim); } ctx->SetOutputsDim("Out", outs_dims); ctx->SetOutputsDim("OutExtend", outs_extended_dims); diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc index a8f91c85485c7c..d37cc35a599450 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cc +++ b/paddle/fluid/operators/pull_box_sparse_op.cc @@ -45,9 +45,10 @@ class PullBoxSparseOp : public framework::OperatorWithKernel { "Shape error in %lu id, the last dimension of the " "'Ids' tensor must be 1.", i)); - auto out_dim = phi::vectorize(phi::slice_ddim(ids_dims, 0, ids_rank - 1)); + auto out_dim = + common::vectorize(common::slice_ddim(ids_dims, 0, ids_rank - 1)); out_dim.push_back(hidden_size); - outs_dims[i] = phi::make_ddim(out_dim); + outs_dims[i] = common::make_ddim(out_dim); } ctx->SetOutputsDim("Out", outs_dims); for (size_t i = 0; i < n_ids; ++i) { diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cc b/paddle/fluid/operators/pull_gpups_sparse_op.cc index afaa9af3fda20a..6055632f5681a1 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.cc +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cc @@ -53,9 +53,10 @@ class PullGpuPSSparseOp : public framework::OperatorWithKernel { "Shape error in %lu id, the last dimension of the " "'Ids' tensor must be 1.", i)); - auto out_dim = phi::vectorize(phi::slice_ddim(ids_dims, 0, ids_rank - 1)); + auto out_dim = + common::vectorize(common::slice_ddim(ids_dims, 0, ids_rank - 1)); out_dim.push_back(embedding_size); - outs_dims[i] = phi::make_ddim(out_dim); + outs_dims[i] = common::make_ddim(out_dim); } ctx->SetOutputsDim("Out", outs_dims); for (size_t i = 0; i < n_ids; ++i) { diff --git a/paddle/fluid/operators/pull_sparse_op.cc b/paddle/fluid/operators/pull_sparse_op.cc index 4850bf33ae89cd..55a6af8466b863 100644 --- a/paddle/fluid/operators/pull_sparse_op.cc +++ 
b/paddle/fluid/operators/pull_sparse_op.cc @@ -47,9 +47,10 @@ class PullSparseOp : public framework::OperatorWithKernel { "Shape error in %lu id, the last dimension of " " the 'Ids' tensor must be 1.", i)); - auto out_dim = phi::vectorize(phi::slice_ddim(ids_dims, 0, ids_rank - 1)); + auto out_dim = + common::vectorize(common::slice_ddim(ids_dims, 0, ids_rank - 1)); out_dim.push_back(hidden_size); - outs_dims[i] = phi::make_ddim(out_dim); + outs_dims[i] = common::make_ddim(out_dim); } ctx->SetOutputsDim("Out", outs_dims); for (size_t i = 0; i < n_ids; ++i) { diff --git a/paddle/fluid/operators/pull_sparse_v2_op.cc b/paddle/fluid/operators/pull_sparse_v2_op.cc index 993950c360c12c..d134607d3c4bb2 100644 --- a/paddle/fluid/operators/pull_sparse_v2_op.cc +++ b/paddle/fluid/operators/pull_sparse_v2_op.cc @@ -40,9 +40,9 @@ class PullSparseV2Op : public framework::OperatorWithKernel { outs_dims.resize(n_ids); for (size_t i = 0; i < n_ids; ++i) { const auto ids_dims = all_ids_dim[i]; - auto out_dim = phi::vectorize(ids_dims); + auto out_dim = common::vectorize(ids_dims); out_dim.push_back(hidden_size); - outs_dims[i] = phi::make_ddim(out_dim); + outs_dims[i] = common::make_ddim(out_dim); } ctx->SetOutputsDim("Out", outs_dims); for (size_t i = 0; i < n_ids; ++i) { diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc index 787797544ef849..45373070d95f96 100644 --- a/paddle/fluid/operators/pyramid_hash_op.cc +++ b/paddle/fluid/operators/pyramid_hash_op.cc @@ -218,7 +218,7 @@ class PyramidHashOP : public framework::OperatorWithKernel { // something to do in runtime. } else { // compile time - ctx->SetOutputDim("Out", phi::make_ddim({-1, num_emb})); + ctx->SetOutputDim("Out", common::make_ddim({-1, num_emb})); ctx->SetOutputDim("X_Temp_Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -295,7 +295,7 @@ class CPUPyramidHashOPKernel : public framework::OpKernel { const auto& offset = bottom->lod()[0]; const auto* bottom_data_ori = bottom->data(); auto* buff = ctx.Output("X_Temp_Out"); - buff->Resize(phi::make_ddim({bottom->dims()[0], bottom->dims()[1]})); + buff->Resize(common::make_ddim({bottom->dims()[0], bottom->dims()[1]})); float* bottom_data = buff->mutable_data(ctx.GetPlace()); for (int i = 0; i < bottom->dims()[0]; i++) { bottom_data[i] = bottom_data_ori[i]; // NOLINT @@ -332,7 +332,7 @@ class CPUPyramidHashOPKernel : public framework::OpKernel { } } - drop_pos->Resize(phi::make_ddim( + drop_pos->Resize(common::make_ddim( {bottom->dims()[0] * bottom->dims()[1] * _pyramid_layer, 1})); std::vector drop_pos_offset; drop_pos_offset.resize(offset.size()); @@ -380,7 +380,7 @@ class CPUPyramidHashOPKernel : public framework::OpKernel { framework::LoD top_lod; top_lod.push_back(top_offset); top->set_lod(top_lod); - top->Resize(phi::make_ddim({top_l, _num_emb})); + top->Resize(common::make_ddim({top_l, _num_emb})); auto* top_data = top->mutable_data(ctx.GetPlace()); framework::LoD drop_pos_lod; diff --git a/paddle/fluid/operators/quantize_linear_op.h b/paddle/fluid/operators/quantize_linear_op.h index d6c3b3d2e50ae8..1f1bfc3dea73bd 100644 --- a/paddle/fluid/operators/quantize_linear_op.h +++ b/paddle/fluid/operators/quantize_linear_op.h @@ -14,13 +14,13 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/fake_quantize_op.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/transform.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/cast_kernel.h" @@ -70,7 +70,7 @@ class QuantizeLinearKernel : public framework::OpKernel { auto* in_accum = context.Input("InAccum"); auto* in_state = context.Input("InState"); phi::DenseTensor tmp_scale; - tmp_scale.Resize(phi::make_dim(1)); + tmp_scale.Resize(common::make_dim(1)); T* cur_scale_data = dev_ctx.template Alloc(&tmp_scale); FindAbsMaxFunctor()( diff --git a/paddle/fluid/operators/randperm_op.h b/paddle/fluid/operators/randperm_op.h index 988b5d475ee31e..96981a4728402a 100644 --- a/paddle/fluid/operators/randperm_op.h +++ b/paddle/fluid/operators/randperm_op.h @@ -54,7 +54,7 @@ class RandpermKernel : public framework::OpKernel { } else { phi::DenseTensor tmp_tensor; - tmp_tensor.Resize(phi::make_ddim({n})); + tmp_tensor.Resize(common::make_ddim({n})); T* tmp_data = tmp_tensor.mutable_data(platform::CPUPlace()); random_permate(tmp_data, n, seed); framework::TensorCopy(tmp_tensor, ctx.GetPlace(), out_tensor); diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h index e59d4f3cfcaddd..195ef276b957e7 100644 --- a/paddle/fluid/operators/range_op.h +++ b/paddle/fluid/operators/range_op.h @@ -58,7 +58,7 @@ class CPURangeKernel : public framework::OpKernel { auto* out = context.Output("Out"); int64_t size = 0; GetSize(start, end, step, &size); - out->Resize(phi::make_ddim({size})); + out->Resize(common::make_ddim({size})); T* out_data = out->mutable_data(context.GetPlace()); T value = start; for (int64_t i = 0; i < size; ++i) { diff --git a/paddle/fluid/operators/rank_attention.cu.h b/paddle/fluid/operators/rank_attention.cu.h index 6726a2defc4b83..7077bd7a7aa4cd 100644 --- a/paddle/fluid/operators/rank_attention.cu.h +++ b/paddle/fluid/operators/rank_attention.cu.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include "paddle/phi/core/utils/dim.h" +#include "paddle/common/dim.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/read_file_op.cc b/paddle/fluid/operators/read_file_op.cc index 0330cac93396c6..239589847673b0 100644 --- a/paddle/fluid/operators/read_file_op.cc +++ b/paddle/fluid/operators/read_file_op.cc @@ -38,7 +38,7 @@ class CPUReadFileKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); std::vector out_shape = {file_size}; - out->Resize(phi::make_ddim(out_shape)); + out->Resize(common::make_ddim(out_shape)); uint8_t* data = out->mutable_data(ctx.GetPlace()); @@ -57,7 +57,7 @@ class ReadFileOp : public framework::OperatorWithKernel { "Output(Out) of ReadFileOp is null.")); auto out_dims = std::vector(1, -1); - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + ctx->SetOutputDim("Out", common::make_ddim(out_dims)); } protected: diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 6b935adf9a2f38..57cf3370563936 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/common/ddim.h" #include "paddle/fluid/operators/reader/py_reader.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { @@ -68,7 +68,7 @@ class CreatePyReaderOp : public framework::OperatorBase { int shape_end_index = shape_start_index + rank; auto shape = std::vector(shape_concat.begin() + shape_start_index, shape_concat.begin() + shape_end_index); - dims.push_back(phi::make_ddim(shape)); + dims.push_back(common::make_ddim(shape)); shape_start_index = shape_end_index; } diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index f3dbc5c49c2ddc..da265a6fce76da 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -18,10 +18,10 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/operators/reader/blocking_queue.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index ff9d4260230dd6..e62d728b6f0171 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -31,7 +31,7 @@ std::vector RestoreShapes(const std::vector& shape_concat, for (int len : ranks) { auto start_it = shape_concat.begin() + offset; auto end_it = start_it + len; - res.push_back(phi::make_ddim(std::vector(start_it, end_it))); + res.push_back(common::make_ddim(std::vector(start_it, end_it))); offset += len; } return res; diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 0362e618c2bc57..a5d4ce5e298289 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -203,9 +203,9 @@ void RecurrentBase::LinkTensor(const framework::Scope &src_scope, // (seq_len, shape) -> return [seq_len] + list(shape) framework::DDim 
RecurrentBase::PrependDims(size_t seq_len, const framework::DDim &src) { - auto dims = phi::vectorize(src); + auto dims = common::vectorize(src); dims.insert(dims.begin(), static_cast(seq_len)); - return phi::make_ddim(dims); + return common::make_ddim(dims); } RecurrentOp::RecurrentOp(const std::string &type, @@ -253,9 +253,9 @@ void RecurrentOp::RunImpl(const framework::Scope &scope, phi::DenseTensor *inside) { inside->ShareDataWith(outside.Slice( seq_offset, seq_offset + 1)); // NOLINT - auto dims = phi::vectorize(inside->dims()); + auto dims = common::vectorize(inside->dims()); dims.erase(dims.begin()); - inside->Resize(phi::make_ddim(dims)); + inside->Resize(common::make_ddim(dims)); }); if (has_state) { @@ -383,9 +383,9 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope, [&](const phi::DenseTensor &outside, phi::DenseTensor *inside) { inside->ShareDataWith( outside.Slice(seq_offset, seq_offset + 1)); // NOLINT - auto dims = phi::vectorize(inside->dims()); + auto dims = common::vectorize(inside->dims()); dims.erase(dims.begin()); - inside->Resize(phi::make_ddim(dims)); + inside->Resize(common::make_ddim(dims)); }, true /*is_backward*/); auto og_set = List2Set(Inputs(kOutputGrads)); @@ -495,7 +495,7 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope, framework::AttributeMap attrs; attrs["dtype"] = framework::TransToProtoVarType(inside_tensor.dtype()); - attrs["shape"] = phi::vectorize(inside_tensor.dims()); + attrs["shape"] = common::vectorize(inside_tensor.dims()); attrs["value"] = 0.0f; auto zero_op = diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 3d262627970515..e69492501c1173 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -544,11 +544,11 @@ class ReduceBaseOp : public framework::OperatorWithKernel { if (reduce_all) { if (keep_dim) ctx->SetOutputDim("Out", - phi::make_ddim(std::vector(x_rank, 1))); + common::make_ddim(std::vector(x_rank, 1))); else ctx->SetOutputDim("Out", {1}); } else { - auto dims_vector = vectorize(x_dims); + auto dims_vector = common::vectorize(x_dims); if (keep_dim) { for (size_t i = 0; i < dims.size(); ++i) { dims_vector[dims[i]] = 1; @@ -565,7 +565,7 @@ class ReduceBaseOp : public framework::OperatorWithKernel { if (!keep_dim && dims_vector.size() == 0) { dims_vector.push_back(1); } - auto out_dims = phi::make_ddim(dims_vector); + auto out_dims = common::make_ddim(dims_vector); ctx->SetOutputDim("Out", out_dims); if (dims.size() > 0 && dims[0] != 0) { // Only pass LoD when not reducing on the first dim. 
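// --------------------------------------------------------------------------
// Editor's sketch (not part of this patch): RecurrentBase::PrependDims above
// keeps its (seq_len, shape) -> [seq_len] + list(shape) logic and only moves
// to the common:: helpers. The same pattern, standalone, hedged on the
// assumption that common::vectorize still defaults to std::vector<int64_t>:
#include <cstddef>
#include <cstdint>
#include <vector>
#include "paddle/common/ddim.h"

inline common::DDim PrependSeqLen(size_t seq_len, const common::DDim &src) {
  std::vector<int64_t> dims = common::vectorize(src);
  dims.insert(dims.begin(), static_cast<int64_t>(seq_len));
  return common::make_ddim(dims);
}
// --------------------------------------------------------------------------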
@@ -810,7 +810,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = in_x->dims().size(); std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); - auto update_dims = vectorize(d_x->dims()); + auto update_dims = common::vectorize(d_x->dims()); int reduce_num = 1; for (auto i : reduce_dims) { reduce_num *= (in_x->dims())[i]; @@ -819,7 +819,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { // make new tensor phi::DenseTensor new_d_out(d_out->type()); new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(phi::make_ddim(update_dims)); + new_d_out.Resize(common::make_ddim(update_dims)); auto& dev_ctx = context.cuda_device_context(); if (out_dtype > 0) { d_x->mutable_data(dev_ctx.GetPlace(), pt_out_dtype); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h index 3176e489f89b30..dd9f22d25c86c1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_function.h @@ -58,13 +58,13 @@ void ReduceFunctor(const DeviceContext& context, DDim out_dims = output->dims(); if (keep_dim && x_rank > 1) { const int kDelFlag = -2; - auto dims_vector = phi::vectorize(out_dims); + auto dims_vector = common::vectorize(out_dims); for (size_t i = 0; i < dims_ref.size(); ++i) { dims_vector[dims_ref[i]] = kDelFlag; } dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), dims_vector.end()); - out_dims = phi::make_ddim(dims_vector); + out_dims = common::make_ddim(dims_vector); } auto& place = *context.eigen_device(); Functor functor; @@ -90,7 +90,7 @@ void ReduceGradFunctor(const DeviceContext& context, auto x_grad = EigenTensor::From(*output); auto x_rank = static_cast(x.dimensions().size()); auto x_dims = input0.dims(); - auto reduced_dims_v = phi::vectorize(x_dims); + auto reduced_dims_v = common::vectorize(x_dims); std::vector dims_ref = dims; Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; @@ -104,7 +104,7 @@ void ReduceGradFunctor(const DeviceContext& context, broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; broad_cats_times *= x_dims[dims_ref[i]]; } - auto reduced_dims = phi::make_ddim(reduced_dims_v); + auto reduced_dims = common::make_ddim(reduced_dims_v); auto x_reduce = EigenTensor::From(input1, reduced_dims); auto x_reduce_grad = EigenTensor::From(input2, reduced_dims); diff --git a/paddle/fluid/operators/repeat_interleave_op.cc b/paddle/fluid/operators/repeat_interleave_op.cc index 44d022f4d5fbce..15b4b80cb739b8 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cc +++ b/paddle/fluid/operators/repeat_interleave_op.cc @@ -39,7 +39,7 @@ class RepeatInterleaveOp : public framework::OperatorWithKernel { auto input_dim = ctx->GetInputDim("X"); auto dim = ctx->Attrs().Get("dim"); - auto output_dim = phi::vectorize(input_dim); + auto output_dim = common::vectorize(input_dim); PADDLE_ENFORCE_EQ( dim < input_dim.size() && dim >= (0 - input_dim.size()), true, @@ -78,7 +78,7 @@ class RepeatInterleaveOp : public framework::OperatorWithKernel { output_dim[dim] = input_dim[dim] * repeats; } VLOG(3) << "infershap out " << output_dim[dim]; - ctx->SetOutputDim("Out", phi::make_ddim(output_dim)); + ctx->SetOutputDim("Out", common::make_ddim(output_dim)); auto type = ctx->GetInputsVarType("X")[0]; if (type == framework::proto::VarType::LOD_TENSOR) { ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/reshape_op.cc 
b/paddle/fluid/operators/reshape_op.cc index 3a57b6da5642ab..30d4fb0cf9ad4c 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -99,7 +99,7 @@ class ReshapeOp : public framework::OperatorWithKernel { infer_shape[i] = static_cast(in_dims[static_cast(i)]); } } - auto infer_out_dims = phi::make_ddim(infer_shape); + auto infer_out_dims = common::make_ddim(infer_shape); ctx->SetOutputDim("Out", infer_out_dims); return; } @@ -112,7 +112,7 @@ class ReshapeOp : public framework::OperatorWithKernel { num_ele *= static_cast(shape_dims[i]); } auto vec_dims = std::vector(num_ele, -1); - auto out_dims = phi::make_ddim(vec_dims); + auto out_dims = common::make_ddim(vec_dims); ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", /*->*/ "Out"); return; @@ -137,8 +137,8 @@ class ReshapeOp : public framework::OperatorWithKernel { static framework::DDim ValidateShape(const std::vector shape, const framework::DDim &in_dims) { - const int64_t in_size = phi::product(in_dims); - auto in_dims_vec = phi::vectorize(in_dims); + const int64_t in_size = common::product(in_dims); + auto in_dims_vec = common::vectorize(in_dims); bool all_positive = std::all_of(in_dims_vec.cbegin(), in_dims_vec.cend(), [](int64_t i) { return i > 0; }); @@ -158,7 +158,7 @@ class ReshapeOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "Only one dimension value of 'shape' in ReshapeOp can " "be -1. But received shape = [%s], shape[%d] is also -1.", - phi::make_ddim(shape), + common::make_ddim(shape), i)); unk_dim_idx = static_cast(i); } else if (shape[i] == copy_dim_val) { @@ -170,7 +170,7 @@ class ReshapeOp : public framework::OperatorWithKernel { "the input tensor X's dimensions. " "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " "X's dimensions = %d.", - phi::make_ddim(shape), + common::make_ddim(shape), i, in_dims, in_dims.size())); @@ -182,7 +182,7 @@ class ReshapeOp : public framework::OperatorWithKernel { "Each dimension value of 'shape' in ReshapeOp must not " "be negative except one unknown dimension. 
" "But received shape = [%s], shape[%d] = %d.", - phi::make_ddim(shape), + common::make_ddim(shape), i, shape[i])); } @@ -212,7 +212,7 @@ class ReshapeOp : public framework::OperatorWithKernel { "'shape' is [%s], known capacity of 'shape' is %d.", in_dims, in_size, - phi::make_ddim(shape), + common::make_ddim(shape), capacity)); } else { output_shape[unk_dim_idx] = -1; @@ -230,7 +230,7 @@ class ReshapeOp : public framework::OperatorWithKernel { "[%s], the capacity of 'shape' is %d.", in_dims, in_size, - phi::make_ddim(shape), + common::make_ddim(shape), capacity)); } } @@ -249,11 +249,11 @@ class ReshapeOp : public framework::OperatorWithKernel { "capacity of 'Out' is %d.", in_dims, in_size, - phi::make_ddim(shape), + common::make_ddim(shape), capacity)); } - return phi::make_ddim(output_shape); + return common::make_ddim(output_shape); } protected: @@ -529,7 +529,7 @@ class Reshape2Op : public ReshapeOp { for (int i = 0; i < x_dims.size(); ++i) { xshape_dims[i + 1] = x_dims[i]; } - ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); + ctx->SetOutputDim("XShape", common::make_ddim(xshape_dims)); ctx->ShareLoD("X", /*->*/ "XShape"); } ReshapeOp::InferShape(ctx); diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc index 785d148f79df0b..5df5270976ca42 100644 --- a/paddle/fluid/operators/sampling_id_op.cc +++ b/paddle/fluid/operators/sampling_id_op.cc @@ -41,7 +41,7 @@ class SamplingIdOp : public framework::OperatorWithKernel { input_dims.size())); auto dim0 = input_dims[0]; - framework::DDim dims = phi::make_ddim({dim0}); + framework::DDim dims = common::make_ddim({dim0}); ctx->SetOutputDim("Out", dims); ctx->ShareLoD("X", "Out"); } diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index 38c0ea3834af7b..730d84c2a651e1 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -74,7 +74,7 @@ class SamplingIdKernel : public framework::OpKernel { out_dim.push_back(static_cast(batch_size)); phi::DenseTensor* output = context.Output("Out"); - output->Resize(phi::make_ddim(out_dim)); + output->Resize(common::make_ddim(out_dim)); output->mutable_data(context.GetPlace()); framework::TensorFromVector(ids, context.device_context(), output); } diff --git a/paddle/fluid/operators/sequence_ops/CMakeLists.txt b/paddle/fluid/operators/sequence_ops/CMakeLists.txt index 1bd10f19e03594..38ac50b0d6434a 100644 --- a/paddle/fluid/operators/sequence_ops/CMakeLists.txt +++ b/paddle/fluid/operators/sequence_ops/CMakeLists.txt @@ -6,5 +6,5 @@ endif() register_operators() if(WITH_UNITY_BUILD) - target_link_libraries(paddle_operators_sequence_ops_unity phi) + target_link_libraries(paddle_operators_sequence_ops_unity phi common) endif() diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc index 3ef695b111993c..dd65162b3aad46 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc @@ -66,7 +66,7 @@ class SequenceConcatOp : public framework::OperatorWithKernel { std::vector out_dims; for (auto &x_dim : x_dims) { if (out_dims.empty()) { - out_dims = phi::vectorize(x_dim); + out_dims = common::vectorize(x_dim); } batch_size += x_dim[0]; PADDLE_ENFORCE_NE( @@ -75,25 +75,25 @@ class SequenceConcatOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The first dim of SequenceConcatOp inputs must not be 0.")); 
if (feature_size == 0) { - feature_size = phi::product(x_dim) / x_dim[0]; + feature_size = common::product(x_dim) / x_dim[0]; } else { PADDLE_ENFORCE_EQ( feature_size, - phi::product(x_dim) / x_dim[0], + common::product(x_dim) / x_dim[0], platform::errors::InvalidArgument( "Each input of SequenceConcatOp inputs must have same feature " "size, But " "the feature size we received is %d, the feature size of 1st " "input is %d", feature_size, - phi::product(x_dim) / x_dim[0])); + common::product(x_dim) / x_dim[0])); } } if (batch_size < 0) { batch_size = -1; // Normalize batch size for compile time. } out_dims[0] = batch_size; - context->SetOutputDim("Out", phi::make_ddim(out_dims)); + context->SetOutputDim("Out", common::make_ddim(out_dims)); if (!context->IsRuntime()) { // Runtime LoD infershape will be computed // in Kernel. context->ShareLoD("X", "Out"); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h index 3eb7e51cfe0c6e..c66f4065a58f15 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h @@ -35,7 +35,7 @@ class SequenceEnumerateKernel : public framework::OpKernel { "Input(X) phi::DenseTensor of SequenceEnumerateOp does not contain " "LoD information.")); - auto in_dims = phi::vectorize(in->dims()); + auto in_dims = common::vectorize(in->dims()); auto lod0 = in->lod()[0]; PADDLE_ENFORCE_EQ( static_cast(in_dims[0]), diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index 5c1058ebf16062..053c439814e957 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -70,7 +70,7 @@ struct SequenceExpandAsFunctor { const phi::Vector &ref_lod, /*expand referenced lod*/ phi::DenseTensor *out) { int height = x.dims()[0]; - int width = phi::product(x.dims()) / height; + int width = common::product(x.dims()) / height; const int kThreadsPerBlock = 1024; int thread_x = kThreadsPerBlock; @@ -100,7 +100,7 @@ struct SequenceExpandAsGradFunctor { const phi::Vector &ref_lod, /*expand based lod*/ phi::DenseTensor *dx) { int height = dx->dims()[0]; - int width = phi::product(dx->dims()) / height; + int width = common::product(dx->dims()) / height; const int kThreadsPerBlock = 1024; int thread_x = kThreadsPerBlock; diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h index be195d72b5665c..26f428b165256b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h @@ -47,7 +47,7 @@ struct SequenceExpandAsFunctor { const phi::Vector &ref_lod, /*expand referenced lod*/ phi::DenseTensor *out) { int64_t height = x.dims()[0]; - int64_t width = phi::product(x.dims()) / height; + int64_t width = common::product(x.dims()) / height; const T *in_data = x.data(); T *out_data = out->mutable_data(context.GetPlace()); @@ -124,7 +124,7 @@ struct SequenceExpandAsGradFunctor { const phi::Vector &ref_lod, /*expand referenced lod*/ phi::DenseTensor *dx) { int64_t height = dx->dims()[0]; - int64_t width = phi::product(dx->dims()) / height; + int64_t width = common::product(dx->dims()) / height; const T *dout_data = dout.data(); T *dx_data = dx->mutable_data(context.GetPlace()); diff --git 
a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index b70b750daba915..e46e17418a490b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -199,7 +199,7 @@ struct SequenceExpandGradFunctor { const phi::Vector& x_lod, /*expand source lod*/ const phi::Vector& ref_lod, /*expand based lod*/ LoDTensor* dx) { - int x_item_length = phi::product(dx->dims()) / dx->dims()[0]; + int x_item_length = common::product(dx->dims()) / dx->dims()[0]; phi::Vector out_offset(x_lod.size()); GetOutputOffset(x_lod, ref_lod, &out_offset); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h index 767942b4db8746..0f53249cfbc240 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h @@ -173,7 +173,7 @@ struct SequenceExpandGradFunctor { int x_seq_len = x_end - x_start; if (x_seq_len == 0) continue; auto dx_sub = dx->Slice(x_start, x_end); - dx_sub.Resize(phi::flatten_to_1d(dx_sub.dims())); + dx_sub.Resize(common::flatten_to_1d(dx_sub.dims())); int dout_end = dout_offset + repeat_num * x_seq_len; auto dout_sub = dout.Slice(dout_offset, dout_end); dout_sub.Resize({repeat_num, dx_sub.dims()[0]}); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index ac78b18602360d..a7f3869e307537 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -52,11 +52,11 @@ class SequencePadOp : public framework::OperatorWithKernel { "The rank of SequencePadOp Input(X) can't be less " "than 2. 
But the rank we received is %d", x_dims.size())); - auto time_step_dims = phi::slice_ddim(x_dims, 1, x_dims.size()); + auto time_step_dims = common::slice_ddim(x_dims, 1, x_dims.size()); auto pad_value_dims = ctx->GetInputDim("PadValue"); PADDLE_ENFORCE_EQ( - pad_value_dims == phi::make_ddim({1}) || - pad_value_dims == phi::make_ddim({}) || + pad_value_dims == common::make_ddim({1}) || + pad_value_dims == common::make_ddim({}) || pad_value_dims == time_step_dims, true, platform::errors::InvalidArgument( @@ -127,12 +127,12 @@ class SequencePadOp : public framework::OperatorWithKernel { std::vector out_dims_vec{out_dim_0, padded_length}; std::vector len_dims_vec{out_dim_0}; - auto time_step_dims_vec = phi::vectorize(time_step_dims); + auto time_step_dims_vec = common::vectorize(time_step_dims); out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(), time_step_dims_vec.end()); - ctx->SetOutputDim("Out", phi::make_ddim(out_dims_vec)); - ctx->SetOutputDim("Length", phi::make_ddim(len_dims_vec)); + ctx->SetOutputDim("Out", common::make_ddim(out_dims_vec)); + ctx->SetOutputDim("Length", common::make_ddim(len_dims_vec)); } protected: diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc index ff956ab2d1d4f7..6e34f76fbd37d0 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc @@ -16,7 +16,7 @@ #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index 19c074ef5f75d7..50a3e976334753 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -130,8 +130,8 @@ class SequenceSliceOpKernel : public framework::OpKernel { out->Resize(out_dims); out->set_lod(out_lod); - auto in_stride = phi::stride(in->dims()); - auto out_stride = phi::stride(out->dims()); + auto in_stride = common::stride(in->dims()); + auto out_stride = common::stride(out->dims()); size_t out_offset = 0; for (size_t i = 0; i < n; ++i) { @@ -193,9 +193,9 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { Tensor out_grad_t = out_grad->Slice(static_cast(out_lod[0][i]), static_cast(out_lod[0][i + 1])); - auto out_grad_stride = phi::stride(out_grad_t.dims()); + auto out_grad_stride = common::stride(out_grad_t.dims()); - auto x_grad_stride = phi::stride(x_grad->dims()); + auto x_grad_stride = common::stride(x_grad->dims()); Tensor x_grad_t = x_grad->Slice( static_cast(lod[0][i] + offset_data[i]), diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 3b2583bbe42996..01f7bb3e928902 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -57,8 +57,8 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) framework::DDim dims_i = - // phi::make_ddim({1UL, end_pos - start_pos, 1UL, 1UL}); - phi::make_ddim({1UL, end_pos - start_pos}); + // common::make_ddim({1UL, end_pos - start_pos, 1UL, 1UL}); + common::make_ddim({1UL, end_pos - start_pos}); x_i.Resize(dims_i); out_i.Resize(dims_i); 
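// --------------------------------------------------------------------------
// Editor's sketch (not part of this patch): the sequence_softmax hunk below
// (like the pad2d hunk earlier) also moves the DataLayout string parser,
// phi::StringToDataLayout -> common::StringToDataLayout. A hedged example;
// the "paddle/common/layout.h" include path is an assumption of this sketch,
// not taken from the patch.
#include <string>
#include "paddle/common/layout.h"

inline common::DataLayout ParseLayout(const std::string &data_format) {
  // "NCHW", "NHWC", "AnyLayout", ... map to the corresponding enum value.
  return common::StringToDataLayout(data_format);
}
// --------------------------------------------------------------------------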
phi::funcs::SoftmaxCUDNNFunctor()( @@ -91,7 +91,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) - framework::DDim dims_i = phi::make_ddim({1UL, end_pos - start_pos}); + framework::DDim dims_i = common::make_ddim({1UL, end_pos - start_pos}); out_i.Resize(dims_i); out_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 0ca5514900d460..12d4f72a91169e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -41,7 +41,8 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); phi::DataLayout layout_ = DataLayout::kAnyLayout; if (ctx.HasAttr("data_format")) { - layout_ = phi::StringToDataLayout(ctx.Attr("data_format")); + layout_ = + common::StringToDataLayout(ctx.Attr("data_format")); } return phi::KernelKey( ctx.GetPlace(), layout_, phi::TransToPhiDataType(input_data_type)); @@ -126,7 +127,8 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Out"); phi::DataLayout layout_ = DataLayout::kAnyLayout; if (ctx.HasAttr("data_format")) { - layout_ = phi::StringToDataLayout(ctx.Attr("data_format")); + layout_ = + common::StringToDataLayout(ctx.Attr("data_format")); } return phi::KernelKey( ctx.GetPlace(), layout_, phi::TransToPhiDataType(input_data_type)); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 33fa45f27972ed..6088b8181646ba 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -79,7 +79,7 @@ class SequenceUnpadOp : public framework::OperatorWithKernel { out_dims_vec.push_back(x_dims[i]); } } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims_vec)); + ctx->SetOutputDim("Out", common::make_ddim(out_dims_vec)); if (!ctx->IsRuntime()) { ctx->SetLoDLevel("Out", 1); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index aff7ce2392d6c2..cc38fd510ef1ea 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -64,7 +64,7 @@ class SequenceUnpadOpKernel : public framework::OpKernel { out_dims_vec.push_back(x_t->dims()[i]); } } - out_t->Resize(phi::make_ddim(out_dims_vec)); + out_t->Resize(common::make_ddim(out_dims_vec)); // after set the lod of output, allocate the memory out_t->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/shuffle_batch_op.cc b/paddle/fluid/operators/shuffle_batch_op.cc index 61b3f30b390100..0b5a7bf5540abd 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cc +++ b/paddle/fluid/operators/shuffle_batch_op.cc @@ -63,7 +63,7 @@ class ShuffleBatchOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", "Out"); ctx->ShareDim("Seed", "SeedOut"); ctx->ShareLoD("Seed", "SeedOut"); - ctx->SetOutputDim("ShuffleIdx", phi::make_ddim({-1})); + ctx->SetOutputDim("ShuffleIdx", common::make_ddim({-1})); } protected: diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 
e12c5b1e7febb2..16b895ce557a77 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -154,7 +154,7 @@ class SliceOp : public framework::OperatorWithKernel { #ifdef PADDLE_WITH_DNNL auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "Input"); - auto vec_dims = phi::vectorize(in_tensor.dims()); + auto vec_dims = common::vectorize(in_tensor.dims()); bool all_zero_dims = std::all_of( vec_dims.cbegin(), vec_dims.cend(), [](int64_t i) { return i == 0; }); if (!all_zero_dims && this->CanMKLDNNBeUsed(ctx, input_data_type)) { diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu index 086de1fd706fd5..117de1c1a55dfb 100644 --- a/paddle/fluid/operators/sparse_attention_op.cu +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -649,7 +649,7 @@ std::vector GetSplitTensor(phi::DenseTensor* input) { for (int i = 1; i < new_dims.size(); i++) { new_dims[i] = dims[i + 1]; } - input->Resize(phi::make_ddim(new_dims)); + input->Resize(common::make_ddim(new_dims)); return input->Split(1, 0); } diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h index e6545cc8d8d995..aaee366a4636a7 100644 --- a/paddle/fluid/operators/split_op.h +++ b/paddle/fluid/operators/split_op.h @@ -79,7 +79,7 @@ static inline std::vector UpdateOutsDims( "Only one dimension value of Attr(num_or_sections) " "in SplitOp can be -1. " "But received Attr(num_or_sections) = [%s].", - phi::make_ddim(sections))); + common::make_ddim(sections))); } if (unk_dim_idx != -1) { @@ -95,7 +95,7 @@ static inline std::vector UpdateOutsDims( "size " "along the split dimension. But received Attr(num_or_sections) " "= [%s], input(X)'s shape = [%s], Attr(dim) = %d.", - phi::make_ddim(sections), + common::make_ddim(sections), in_dims, axis)); if (each_section_is_known) { @@ -110,7 +110,7 @@ static inline std::vector UpdateOutsDims( "size " "along the split dimension. 
But received Attr(num_or_sections)" " = [%s], input(X)'s shape = [%s], Attr(dim) = %d.", - phi::make_ddim(sections), + common::make_ddim(sections), in_dims, axis)); } diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc index 9fe04caf3e1003..98072746e8eee7 100644 --- a/paddle/fluid/operators/spp_op.cc +++ b/paddle/fluid/operators/spp_op.cc @@ -80,7 +80,7 @@ class SppOp : public framework::OperatorWithKernel { int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1]; // NOLINT std::vector output_shape({in_x_dims[0], outlen}); - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); + ctx->SetOutputDim("Out", common::make_ddim(output_shape)); } }; diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index bf810e8825570b..5d3f4a78020a02 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -34,7 +34,7 @@ class SppKernel : public framework::OpKernel { std::string pooling_type = context.template Attr("pooling_type"); out->mutable_data(context.GetPlace()); - auto out_stride = phi::stride(out->dims()); + auto out_stride = common::stride(out->dims()); int input_h = in_x->dims()[2]; int input_w = in_x->dims()[3]; size_t output_offset = 0; @@ -51,7 +51,7 @@ class SppKernel : public framework::OpKernel { phi::DenseTensor out_level; std::vector output_shape_vec( {in_x->dims()[0], in_x->dims()[1], bins, bins}); - framework::DDim output_shape(phi::make_ddim(output_shape_vec)); + framework::DDim output_shape(common::make_ddim(output_shape_vec)); out_level.mutable_data(output_shape, context.GetPlace()); // pooling if (pooling_type == "max") { @@ -92,10 +92,10 @@ class SppKernel : public framework::OpKernel { std::vector output_flatten_shape_vec( {in_x->dims()[0], output_flatten_w}); framework::DDim output_flatten_shape( - phi::make_ddim(output_flatten_shape_vec)); + common::make_ddim(output_flatten_shape_vec)); out_level.Resize(output_flatten_shape); // concat - auto out_level_stride = phi::stride(out_level.dims()); + auto out_level_stride = common::stride(out_level.dims()); phi::funcs::StridedMemcpy( context.template device_context(), out_level.data(), @@ -127,7 +127,7 @@ class SppGradKernel : public framework::OpKernel { zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); - auto out_stride = phi::stride(out->dims()); + auto out_stride = common::stride(out->dims()); int input_h = in_x->dims()[2]; int input_w = in_x->dims()[3]; size_t out_offset = 0; @@ -146,10 +146,11 @@ class SppGradKernel : public framework::OpKernel { int out_flatten_w = in_x->dims()[1] * bins * bins; std::vector out_flatten_shape_vec( {in_x->dims()[0], out_flatten_w}); - framework::DDim out_flatten_shape(phi::make_ddim(out_flatten_shape_vec)); + framework::DDim out_flatten_shape( + common::make_ddim(out_flatten_shape_vec)); out_level.mutable_data(out_flatten_shape, context.GetPlace()); outgrad_level.mutable_data(out_flatten_shape, context.GetPlace()); - auto flatten_stride = phi::stride(out_level.dims()); + auto flatten_stride = common::stride(out_level.dims()); // memcpy phi::funcs::StridedMemcpy( context.template device_context(), @@ -174,7 +175,7 @@ class SppGradKernel : public framework::OpKernel { (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1); out_shape_vec.push_back( (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1); - framework::DDim out_shape(phi::make_ddim(out_shape_vec)); + framework::DDim out_shape(common::make_ddim(out_shape_vec)); 
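The spp_op changes lean on the stride helper, which moves the same way. For a contiguous row-major shape, stride yields the per-dimension element step that StridedMemcpy uses to walk sub-tensors into the concatenated output. A hedged sketch of the contract, with a hand-rolled equivalent for concreteness (helper names are illustrative):

#include <cstdint>
#include <vector>

#include "paddle/common/ddim.h"

// common::stride({2, 3, 4}) is expected to yield {12, 4, 1}: the number of
// elements skipped when the corresponding index advances by one.
common::DDim RowMajorStrides(const common::DDim &dims) {
  return common::stride(dims);
}

// Hand-rolled equivalent of the same computation, for reference.
common::DDim RowMajorStridesManual(const common::DDim &dims) {
  std::vector<int64_t> s(dims.size(), 1);
  for (int i = dims.size() - 2; i >= 0; --i) {
    s[i] = s[i + 1] * dims[i + 1];
  }
  return common::make_ddim(s);
}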
out_level.ShareDataWith(out_level); out_level.Resize(out_shape); outgrad_level.ShareDataWith(outgrad_level); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h index 6f0da1d42e5467..10ff809d608886 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -85,7 +85,7 @@ framework::DDim GetOutputShape(const std::vector squeeze_dims, output_shape.push_back(in_dims[i]); } } - return phi::make_ddim(output_shape); + return common::make_ddim(output_shape); } template @@ -119,7 +119,7 @@ class Squeeze2GradKernel : public framework::OpKernel { // auto in_dims = d_x->dims(); auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); d_x->mutable_data(ctx.GetPlace(), d_out->type()); framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc index 14c1af81c16103..34f6ee854dd7bc 100644 --- a/paddle/fluid/operators/stft_op.cc +++ b/paddle/fluid/operators/stft_op.cc @@ -75,7 +75,7 @@ class StftOp : public framework::OperatorWithKernel { } output_shape.push_back(n_frames); - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); + ctx->SetOutputDim("Out", common::make_ddim(output_shape)); } protected: diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h index b054531e4e05ef..1f848cb393fae2 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.h +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -176,11 +176,12 @@ class FasterTokenizerKernel : public framework::OpKernel { } input_ids->Resize( - phi::make_ddim({static_cast(batch_size), - static_cast(batch_max_seq_len)})); + common::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); auto* input_ids_data = input_ids->mutable_data(ctx.GetPlace()); - seg_ids->Resize(phi::make_ddim({static_cast(batch_size), - static_cast(batch_max_seq_len)})); + seg_ids->Resize( + common::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); auto* seg_ids_data = seg_ids->mutable_data(ctx.GetPlace()); auto pad_token_id = tokenizer.GetPadTokenID(); diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index ccf5cd09a08426..caa31565d4cf3d 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -20,6 +20,7 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/diag_op.h" @@ -27,7 +28,6 @@ #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" @@ -85,8 +85,8 @@ static std::vector GetBroadcastShape(InTensors ins) { auto x_dim = ins[0]->dims(); auto y_dim = ins[1]->dims(); std::vector broadcast_shape = - (x_dim.size() > y_dim.size() ? phi::vectorize(x_dim) - : phi::vectorize(y_dim)); + (x_dim.size() > y_dim.size() ? 
common::vectorize(x_dim) + : common::vectorize(y_dim)); int rank_min = std::min(x_dim.size(), y_dim.size()); int rank_x = x_dim.size(); int rank_y = y_dim.size(); @@ -301,10 +301,10 @@ struct DeviceIndependenceTensorOperations { phi::DenseTensor ret; auto a_dim = mat_a.dims(); auto b_dim = mat_b.dims(); - std::vector x_vec = phi::vectorize(a_dim); + std::vector x_vec = common::vectorize(a_dim); x_vec[x_vec.size() - 2] = a_dim[a_dim.size() - (trans_a ? 1 : 2)]; x_vec[x_vec.size() - 1] = b_dim[b_dim.size() - (trans_b ? 2 : 1)]; - ret.Resize(phi::make_ddim(x_vec)); + ret.Resize(common::make_ddim(x_vec)); ret.mutable_data(context.GetPlace()); auto blas = GetBlas(); auto mat_a_discrib = phi::funcs::CreateMatrixDescriptor(a_dim, 0, trans_a); @@ -318,7 +318,7 @@ struct DeviceIndependenceTensorOperations { // transpose the last two dimision phi::DenseTensor ret; auto x_dim = x.dims(); - auto x_vec = phi::vectorize(x_dim); + auto x_vec = common::vectorize(x_dim); int rank = x_vec.size(); std::swap(x_vec[rank - 1], x_vec[rank - 2]); std::vector out_shape = x_vec; @@ -328,7 +328,7 @@ struct DeviceIndependenceTensorOperations { } std::swap(axis[rank - 1], axis[rank - 2]); auto& dev_ctx = context.template device_context(); - ret.Resize(phi::make_ddim(x_vec)); + ret.Resize(common::make_ddim(x_vec)); ret.mutable_data(context.GetPlace()); switch (rank) { DITO_TRANSPOSE_RANK_CASE(2); @@ -397,7 +397,7 @@ struct DeviceIndependenceTensorOperations { for (int i = 0; i < num_dims - 1; ++i) { out_shape.push_back(x.dims()[i]); } - out.Resize(phi::make_ddim(out_shape)); + out.Resize(common::make_ddim(out_shape)); int order = x.dims()[num_dims - 1]; int stride_out = order * order; int stride_in = order + 1; @@ -414,7 +414,7 @@ struct DeviceIndependenceTensorOperations { const phi::DenseTensor& y) { phi::DenseTensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); - ret.Resize(phi::make_ddim(out_shape)); + ret.Resize(common::make_ddim(out_shape)); ElementwiseComputeEx, DeviceContext, T>( context, &x, &y, -1, RealMulComplexFunctor(), &ret); return ret; @@ -432,7 +432,7 @@ struct DeviceIndependenceTensorOperations { out_vector.device(place) = x_vector / y_vector; } else { std::vector out_shape = GetBroadcastShape({&x, &y}); - ret.Resize(phi::make_ddim(out_shape)); + ret.Resize(common::make_ddim(out_shape)); ElementwiseComputeEx, DeviceContext, T>( context, &x, &y, -1, DivFunctor(), &ret); } @@ -442,7 +442,7 @@ struct DeviceIndependenceTensorOperations { // element wise add, support numpy broadcast. 
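All of the svd_helper edits are one idiom: common::vectorize copies the shape into a mutable std::vector, the vector is edited, and common::make_ddim converts it back for the Resize call. The Transpose helper above, for instance, reduces to this sketch (rank >= 2 is assumed, as the surrounding code enforces; the name is illustrative):

#include <utility>
#include <vector>

#include "paddle/common/ddim.h"

// Swap the trailing two dimensions, as the batched-matmul helpers do before
// launching the transpose kernel.
common::DDim SwapLastTwoDims(const common::DDim &dims) {
  std::vector<int64_t> shape = common::vectorize(dims);  // DDim -> vector
  const auto rank = shape.size();                        // caller ensures >= 2
  std::swap(shape[rank - 1], shape[rank - 2]);
  return common::make_ddim(shape);                       // vector -> DDim
}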
phi::DenseTensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); - ret.Resize(phi::make_ddim(out_shape)); + ret.Resize(common::make_ddim(out_shape)); ElementwiseComputeEx, DeviceContext, T>( context, &x, &y, -1, AddFunctor(), &ret); return ret; @@ -450,7 +450,7 @@ struct DeviceIndependenceTensorOperations { phi::DenseTensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) { phi::DenseTensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); - ret.Resize(phi::make_ddim(out_shape)); + ret.Resize(common::make_ddim(out_shape)); ElementwiseComputeEx, DeviceContext, T>( context, &x, &y, -1, MulFunctor(), &ret); return ret; @@ -476,7 +476,7 @@ struct DeviceIndependenceTensorOperations { phi::DenseTensor Sub(const phi::DenseTensor& x, const phi::DenseTensor& y) { phi::DenseTensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); - ret.Resize(phi::make_ddim(out_shape)); + ret.Resize(common::make_ddim(out_shape)); if (platform::is_gpu_place(context.GetPlace())) { #if defined(__NVCC__) || defined(__HIPCC__) // For GPU, there is no need to define XxxInverseFunctor and call @@ -501,7 +501,7 @@ struct DeviceIndependenceTensorOperations { // don't copy data, only change the dims phi::DenseTensor out; out.ShareDataWith(x); - std::vector out_shape = phi::vectorize(x.dims()); + std::vector out_shape = common::vectorize(x.dims()); if (axis >= 0) { auto index = (out_shape.begin() + axis); out_shape.insert(index, 1); @@ -509,12 +509,12 @@ struct DeviceIndependenceTensorOperations { auto index = (out_shape.end() + axis + 1); out_shape.insert(index, 1); } - out.Resize(phi::make_ddim(out_shape)); + out.Resize(common::make_ddim(out_shape)); return out; } phi::DenseTensor Fill(std::vector shape, float fill_value) { phi::DenseTensor ret; - ret.Resize(phi::make_ddim(shape)); + ret.Resize(common::make_ddim(shape)); ret.mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); phi::funcs::SetConstant()(dev_ctx, &ret, T(fill_value)); @@ -535,7 +535,7 @@ struct DeviceIndependenceTensorOperations { std::vector ends) { phi::DenseTensor ret; std::vector new_axes = axes; - std::vector out_shape = phi::vectorize(x.dims()); + std::vector out_shape = common::vectorize(x.dims()); size_t rank = out_shape.size(); PADDLE_ENFORCE_EQ( axes.size(), @@ -566,7 +566,7 @@ struct DeviceIndependenceTensorOperations { offset[new_axes[i]] = starts[i]; extends[new_axes[i]] = ends[i] - starts[i]; } - ret.Resize(phi::make_ddim(out_shape)); + ret.Resize(common::make_ddim(out_shape)); ret.mutable_data(context.GetPlace()); switch (rank) { DITO_SLICE_RANK_CASE(1); @@ -596,7 +596,7 @@ struct DeviceIndependenceTensorOperations { x_rank, 2, platform::errors::InvalidArgument("Rank must be at least 2.")); - std::vector out_shape = phi::vectorize(x.dims()); + std::vector out_shape = common::vectorize(x.dims()); return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape); } @@ -613,8 +613,8 @@ struct DeviceIndependenceTensorOperations { auto x_dims = x.dims(); auto y_dims = y.dims(); auto y_dims_n = y_dims.size(); - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); + std::vector x_dims_vec = common::vectorize(x_dims); + std::vector y_dims_vec = common::vectorize(y_dims); std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); std::vector y_dims_vec_cut(y_dims_vec.begin(), @@ -646,7 +646,7 @@ struct DeviceIndependenceTensorOperations { if (out_dims[axis_] < 0) { out_dims[axis_] = -1; } - std::vector out_shape = 
phi::vectorize(out_dims); + std::vector out_shape = common::vectorize(out_dims); return CreateOpRunAndReturnTensor("concat", inputs, attrs, out_shape); } @@ -755,8 +755,8 @@ struct DeviceIndependenceTensorOperations { auto out_var = local_scope.Var("tmp_Out"); // return the Out // create Out phi::DenseTensor and allocat memory out_var->GetMutable()->mutable_data( - phi::make_ddim(out_shape), context.GetPlace()); - // phi::make_ddim(out_shape) + common::make_ddim(out_shape), context.GetPlace()); + // common::make_ddim(out_shape) framework::VariableNameMap op_inputs; int counter = 0; for (auto item : inputs) { @@ -780,7 +780,7 @@ struct DeviceIndependenceTensorOperations { op->Run(local_scope, context.GetPlace()); phi::DenseTensor out; out.ShareDataWith(*(out_var->GetMutable())); - out.Resize(phi::make_ddim(out_shape)); + out.Resize(common::make_ddim(out_shape)); context.scope().DeleteScope(&local_scope); return out; } diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu index 7f618db46976a9..af69594f992cde 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -51,7 +51,7 @@ void SyncBatchNormKernel(const Context& ctx, double epsilon = epsilon_f; const bool trainable_stats = trainable_statistics; - const DataLayout layout = phi::StringToDataLayout(data_layout_str); + const DataLayout layout = common::StringToDataLayout(data_layout_str); bool test_mode = is_test && (!trainable_statistics); const auto& x_dims = x.dims(); PADDLE_ENFORCE_GE(x_dims.size(), diff --git a/paddle/fluid/operators/sync_batch_norm_utils.h b/paddle/fluid/operators/sync_batch_norm_utils.h index 7c14f6dfac324d..c132a91bb5346c 100644 --- a/paddle/fluid/operators/sync_batch_norm_utils.h +++ b/paddle/fluid/operators/sync_batch_norm_utils.h @@ -30,9 +30,9 @@ namespace cub = hipcub; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif +#include "paddle/common/layout.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/kernels/funcs/norm_utils.cu.h" #include "paddle/phi/kernels/funcs/norm_utils.h" @@ -412,7 +412,7 @@ void SyncBatchNormGradFunctor( DenseTensor *bias_grad) { double epsilon = static_cast(epsilon_f); - const DataLayout layout = phi::StringToDataLayout(data_layout_str); + const DataLayout layout = common::StringToDataLayout(data_layout_str); const auto *d_y = &y_grad; diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc index 64c67d67b776ab..7b9932ffb4a62c 100644 --- a/paddle/fluid/operators/tdm_child_op.cc +++ b/paddle/fluid/operators/tdm_child_op.cc @@ -89,10 +89,10 @@ class TDMChildOp : public framework::OperatorWithKernel { info_dims.size(), info_dims)); - auto output_dims = phi::vectorize(input_dims); + auto output_dims = common::vectorize(input_dims); output_dims.push_back(child_nums); - ctx->SetOutputDim("Child", phi::make_ddim(output_dims)); - ctx->SetOutputDim("LeafMask", phi::make_ddim(output_dims)); + ctx->SetOutputDim("Child", common::make_ddim(output_dims)); + ctx->SetOutputDim("LeafMask", common::make_ddim(output_dims)); if (ctx->GetOutputsVarType("Child")[0] == framework::proto::VarType::LOD_TENSOR) { diff --git a/paddle/fluid/operators/tdm_sampler_op.cc b/paddle/fluid/operators/tdm_sampler_op.cc index 
f5e835ca2f7a79..d516af77183653 100644 --- a/paddle/fluid/operators/tdm_sampler_op.cc +++ b/paddle/fluid/operators/tdm_sampler_op.cc @@ -101,15 +101,16 @@ class TDMSamplerOp : public framework::OperatorWithKernel { } auto input_dims = ctx->GetInputDim("X"); - auto ddim = phi::make_ddim({-1, sample_res_length}); + auto ddim = common::make_ddim({-1, sample_res_length}); if (ctx->IsRuntime()) { - auto output_dims = phi::vectorize(input_dims); + auto output_dims = common::vectorize(input_dims); auto batch_size = output_dims[0]; - ctx->SetOutputDim("Out", phi::make_ddim({batch_size, sample_res_length})); + ctx->SetOutputDim("Out", + common::make_ddim({batch_size, sample_res_length})); ctx->SetOutputDim("Labels", - phi::make_ddim({batch_size, sample_res_length})); + common::make_ddim({batch_size, sample_res_length})); ctx->SetOutputDim("Mask", - phi::make_ddim({batch_size, sample_res_length})); + common::make_ddim({batch_size, sample_res_length})); } else { ctx->SetOutputDim("Out", ddim); ctx->SetOutputDim("Labels", ddim); diff --git a/paddle/fluid/operators/tdm_sampler_op.h b/paddle/fluid/operators/tdm_sampler_op.h index f3b55c4a5cc34b..984e6666e1f915 100644 --- a/paddle/fluid/operators/tdm_sampler_op.h +++ b/paddle/fluid/operators/tdm_sampler_op.h @@ -60,7 +60,7 @@ void TDMSamplerInner(const framework::ExecutionContext &context, } VLOG(3) << "TDM: sample res length: " << sample_res_length; - auto travel_dim = phi::vectorize(travel_lod_tensor.dims()); + auto travel_dim = common::vectorize(travel_lod_tensor.dims()); auto total_sample_nums = input_ids_num * sample_res_length; // get all data diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index 68f8153e88e600..51b75832d078ac 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -165,7 +165,7 @@ class TemporalShiftOpCUDAKernel : public framework::OpKernel { int t = ctx.Attr("seg_num"); float shift_ratio = ctx.Attr("shift_ratio"); const std::string data_format_str = ctx.Attr("data_format"); - const DataLayout data_layout = phi::StringToDataLayout(data_format_str); + const DataLayout data_layout = common::StringToDataLayout(data_format_str); const int nt = input->dims()[0]; const int c = (data_layout == DataLayout::kNCHW ? input->dims()[1] @@ -184,8 +184,8 @@ class TemporalShiftOpCUDAKernel : public framework::OpKernel { const int c2 = static_cast(c * 2 * shift_ratio); framework::DDim out_dims = - (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w}) - : phi::make_ddim({nt, h, w, c})); + (data_layout == DataLayout::kNCHW ? common::make_ddim({nt, c, h, w}) + : common::make_ddim({nt, h, w, c})); const T* input_data = input->data(); T* output_data = output->mutable_data(out_dims, ctx.GetPlace()); @@ -219,7 +219,7 @@ class TemporalShiftGradOpCUDAKernel : public framework::OpKernel { int t = ctx.Attr("seg_num"); float shift_ratio = ctx.Attr("shift_ratio"); const std::string data_format_str = ctx.Attr("data_format"); - const DataLayout data_layout = phi::StringToDataLayout(data_format_str); + const DataLayout data_layout = common::StringToDataLayout(data_format_str); const int nt = output_grad->dims()[0]; const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1] @@ -238,8 +238,8 @@ class TemporalShiftGradOpCUDAKernel : public framework::OpKernel { const int c2 = static_cast(c * 2 * shift_ratio); framework::DDim in_grad_dims = - (data_layout == DataLayout::kNCHW ? 
phi::make_ddim({nt, c, h, w}) - : phi::make_ddim({nt, h, w, c})); + (data_layout == DataLayout::kNCHW ? common::make_ddim({nt, c, h, w}) + : common::make_ddim({nt, h, w, c})); const T* output_grad_data = output_grad->data(); T* input_grad_data = input_grad->mutable_data(in_grad_dims, ctx.GetPlace()); diff --git a/paddle/fluid/operators/temporal_shift_op.h b/paddle/fluid/operators/temporal_shift_op.h index ec2533316e107f..47007a10038b4c 100644 --- a/paddle/fluid/operators/temporal_shift_op.h +++ b/paddle/fluid/operators/temporal_shift_op.h @@ -97,7 +97,7 @@ class TemporalShiftGradKernel : public framework::OpKernel { int t = ctx.Attr("seg_num"); float shift_ratio = ctx.Attr("shift_ratio"); const std::string data_format_str = ctx.Attr("data_format"); - const DataLayout data_layout = phi::StringToDataLayout(data_format_str); + const DataLayout data_layout = common::StringToDataLayout(data_format_str); const int nt = output_grad->dims()[0]; const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1] @@ -116,8 +116,8 @@ class TemporalShiftGradKernel : public framework::OpKernel { const int c2 = static_cast(c * 2 * shift_ratio); framework::DDim in_grad_dims = - (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w}) - : phi::make_ddim({nt, h, w, c})); + (data_layout == DataLayout::kNCHW ? common::make_ddim({nt, c, h, w}) + : common::make_ddim({nt, h, w, c})); const T* output_grad_data = output_grad->data(); T* input_grad_data = input_grad->mutable_data(in_grad_dims, ctx.GetPlace()); diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 6188106f64bfab..69c7446d85d470 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -109,9 +109,9 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase { } } } - auto vec = phi::vectorize(out_dims); + auto vec = common::vectorize(out_dims); vec.insert(vec.begin() + axis, inx.size()); // NOLINT - out.Resize(phi::make_ddim(vec)); + out.Resize(common::make_ddim(vec)); LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names); @@ -178,16 +178,16 @@ class LoDTensorArray2TensorOpInferShape : public framework::InferShapeBase { if (ctx->IsRuntime()) return; auto dims = ctx->GetInputDim("X"); // if the shape is empty - if (dims == phi::make_ddim({0UL})) return; + if (dims == common::make_ddim({0UL})) return; // otherwise, suppose the shape of array is the shape of tensor in the // array, which is consistent with what tensor_array_read_write dose auto axis = ctx->Attrs().Get("axis"); auto use_stack = ctx->Attrs().Get("use_stack"); if (use_stack) { - auto dim_vec = phi::vectorize(dims); + auto dim_vec = common::vectorize(dims); // use -1 for the stack dim size dim_vec.insert(dim_vec.begin() + axis, -1); - dims = phi::make_ddim(dim_vec); + dims = common::make_ddim(dim_vec); } else { // use -1 for the concat dim size dims[axis] = -1; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 77b0ebe1ce4937..8c75a7bc00f1c8 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -23,6 +23,7 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/fluid/framework/data_device_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" @@ -39,7 +40,6 @@ #include "paddle/phi/common/data_type.h" 
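The temporal_shift kernels above (and the transpose ops further down) pair two relocated helpers: common::StringToDataLayout parses the data_format attribute string, and the resulting layout selects between two common::make_ddim shapes. A compact sketch, assuming the enum is reachable as common::DataLayout after layout.h's move to paddle/common (the hunks keep the unqualified DataLayout alias):

#include <cstdint>
#include <string>

#include "paddle/common/ddim.h"
#include "paddle/common/layout.h"

// Map the op's data_format attribute onto an NCHW or NHWC output shape, the
// way TemporalShiftOpCUDAKernel assembles out_dims above.
common::DDim ShiftedDims(const std::string &data_format,
                         int64_t nt, int64_t c, int64_t h, int64_t w) {
  const common::DataLayout layout = common::StringToDataLayout(data_format);
  return layout == common::DataLayout::kNCHW
             ? common::make_ddim({nt, c, h, w})
             : common::make_ddim({nt, h, w, c});
}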
#include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/funcs/data_type_transform.h" #include "paddle/utils/string/string_helper.h" @@ -288,7 +288,7 @@ class TensorRTEngineOp : public framework::OperatorBase { scope, name_real); VLOG(4) << "trt engine runtime input name(" << name << "), dims(" << t.dims() << ")"; - auto t_shape = phi::vectorize(t.dims()); + auto t_shape = common::vectorize(t.dims()); runtime_input_shape.insert(std::make_pair(name, t_shape)); // We need collect value range for shape tensor for Paddle-TRT's use. // To be noticed, this method to identify all shape tensors is based on @@ -446,7 +446,7 @@ class TensorRTEngineOp : public framework::OperatorBase { if (param_names_.count(x)) continue; auto &t = inference::analysis::GetFromScope(scope, x); calib_buffers[x] = t.memory_size(); - auto t_shape = phi::vectorize(t.dims()); + auto t_shape = common::vectorize(t.dims()); runtime_batch = t_shape[0]; } calib_res->calib_ = std::make_unique( @@ -549,7 +549,7 @@ class TensorRTEngineOp : public framework::OperatorBase { framework::TensorCopy(t, dev_place, dev_ctx, &out); t.ShareDataWith(out); } - auto t_shape = phi::vectorize(t.dims()); + auto t_shape = common::vectorize(t.dims()); // This must be a zero dimension tensor. // At present, we convert it to a 1D tensor to feed them into Trt. @@ -736,7 +736,7 @@ class TensorRTEngineOp : public framework::OperatorBase { platform::errors::NotFound( "Output variable %s is not found in TensorRT subgraph.", y)); auto *fluid_t = fluid_v->GetMutable(); - fluid_t->Resize(phi::make_ddim(ddim)); + fluid_t->Resize(common::make_ddim(ddim)); PADDLE_ENFORCE_LT(bind_index, num_bindings, diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 23ccf702685577..04130e3f242397 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -37,7 +37,7 @@ void CreateCUDATensor(framework::Scope* scope, const std::vector& shape) { auto* var = scope->Var(name); auto* tensor = var->GetMutable(); - auto dims = phi::make_ddim(shape); + auto dims = common::make_ddim(shape); tensor->Resize(dims); platform::CUDAPlace place; phi::GPUContext ctx(place); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index f1674bc5005a0b..ef6172b6965f22 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -87,7 +87,7 @@ class TopkOpCUDAKernel : public framework::OpKernel { framework::DDim inputdims = input->dims(); const int64_t input_height = - phi::product(phi::slice_ddim(inputdims, 0, inputdims.size() - 1)); + common::product(common::slice_ddim(inputdims, 0, inputdims.size() - 1)); const int64_t input_width = inputdims[inputdims.size() - 1]; const auto& dev_ctx = ctx.cuda_device_context(); if ((input_width <= 1024 || k >= 128 || k == input_width)) { @@ -157,7 +157,7 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { framework::DDim xdims = x->dims(); const size_t row = - phi::product(phi::slice_ddim(xdims, 0, xdims.size() - 1)); + common::product(common::slice_ddim(xdims, 0, xdims.size() - 1)); const size_t col = xdims[xdims.size() - 1]; const auto& dev_ctx = context.cuda_device_context(); const int kMaxHeight = 2048; diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 
27f246415a94ca..f8fa53e2ad5056 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -49,7 +49,7 @@ class TopkKernel : public framework::OpKernel { // reshape input to a flattern matrix(like flat_inner_dims) framework::DDim inputdims = input->dims(); const size_t row = - phi::product(phi::slice_ddim(inputdims, 0, inputdims.size() - 1)); + common::product(common::slice_ddim(inputdims, 0, inputdims.size() - 1)); const size_t col = inputdims[inputdims.size() - 1]; Eigen::DSizes flat2dims(row, col); // NOTE: eigen shape doesn't affect paddle tensor. @@ -106,7 +106,7 @@ class TopkGradKernel : public framework::OpKernel { framework::DDim xdims = x->dims(); const size_t row = - phi::product(phi::slice_ddim(xdims, 0, xdims.size() - 1)); + common::product(common::slice_ddim(xdims, 0, xdims.size() - 1)); const size_t col = xdims[xdims.size() - 1]; memset(x_grad_data, 0, row * col * sizeof(T)); diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc index df1725265ebde7..55d3fa8624a8cd 100644 --- a/paddle/fluid/operators/top_k_op_xpu.cc +++ b/paddle/fluid/operators/top_k_op_xpu.cc @@ -63,7 +63,7 @@ class TopkXPUKernel : public framework::OpKernel { // reshape input to a flattern matrix(like flat_inner_dims) framework::DDim inputdims = input->dims(); const size_t row = - phi::product(phi::slice_ddim(inputdims, 0, inputdims.size() - 1)); + common::product(common::slice_ddim(inputdims, 0, inputdims.size() - 1)); const size_t col = inputdims[inputdims.size() - 1]; // int sorted_topk(Context* ctx, const T* x, T* y, int* index, int m, int n, diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h index edb78e87e52334..52633640fa95bc 100644 --- a/paddle/fluid/operators/transfer_layout_op.h +++ b/paddle/fluid/operators/transfer_layout_op.h @@ -93,7 +93,7 @@ class TransferLayoutFunctor { } auto out_tz = out_tensor.dims().size() == 0 ? 
std::vector{1} - : phi::vectorize(out_tensor.dims()); + : common::vectorize(out_tensor.dims()); dnnl::memory::data_type in_type = phi::funcs::ToOneDNNDataType(in_tensor.dtype()); @@ -134,7 +134,7 @@ class TransferLayoutFunctor { const phi::DenseTensor &in, phi::DenseTensor *out) const { PADDLE_ENFORCE_EQ( - phi::arity(in.dims()), + common::arity(in.dims()), 4, platform::errors::InvalidArgument( "Input dimension arity only can be 4, the input dimension is %s.", @@ -149,7 +149,7 @@ class TransferLayoutFunctor { dst_dim[i] = src_dim[axis[i]]; } - out->Resize(phi::make_ddim(dst_dim)); + out->Resize(common::make_ddim(dst_dim)); out->mutable_data(in.place(), in.type()); framework::VisitDataType( diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 652f88fec8f45a..417299d24db072 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -25,7 +25,7 @@ phi::KernelKey TransposeOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); auto &data_format = ctx.Attr("data_format"); - phi::DataLayout layout_ = phi::StringToDataLayout(data_format); + phi::DataLayout layout_ = common::StringToDataLayout(data_format); return phi::KernelKey( ctx.GetPlace(), layout_, phi::TransToPhiDataType(data_type)); } @@ -101,7 +101,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel { auto data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); std::string data_format = ctx.Attr("data_format"); - phi::DataLayout layout_ = phi::StringToDataLayout(data_format); + phi::DataLayout layout_ = common::StringToDataLayout(data_format); return phi::KernelKey( ctx.GetPlace(), layout_, phi::TransToPhiDataType(data_type)); } @@ -121,7 +121,7 @@ void Transpose2Op::InferShape(framework::InferShapeContext *ctx) const { for (int i = 0; i < in_dims.size(); ++i) { x_shape_dim[i + 1] = in_dims[i]; } - ctx->SetOutputDim("XShape", phi::make_ddim(x_shape_dim)); + ctx->SetOutputDim("XShape", common::make_ddim(x_shape_dim)); ctx->ShareLoD("X", /*->*/ "XShape"); } @@ -129,7 +129,7 @@ phi::KernelKey Transpose2Op::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); auto &data_format = ctx.Attr("data_format"); - phi::DataLayout layout_ = phi::StringToDataLayout(data_format); + phi::DataLayout layout_ = common::StringToDataLayout(data_format); return phi::KernelKey( ctx.GetPlace(), layout_, phi::TransToPhiDataType(data_type)); } @@ -233,7 +233,7 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { OperatorWithKernel::IndicateVarDataType(ctx, framework::GradVarName("Out")); std::string data_format = ctx.Attr("data_format"); - phi::DataLayout layout_ = phi::StringToDataLayout(data_format); + phi::DataLayout layout_ = common::StringToDataLayout(data_format); return phi::KernelKey( ctx.GetPlace(), layout_, phi::TransToPhiDataType(data_type)); } diff --git a/paddle/fluid/operators/unbind_op.h b/paddle/fluid/operators/unbind_op.h index 51347e45929886..7a5bf4d34c47c7 100644 --- a/paddle/fluid/operators/unbind_op.h +++ b/paddle/fluid/operators/unbind_op.h @@ -33,7 +33,7 @@ static inline framework::DDim UnbindOutsDims(const framework::DDim in_dims, for (int i = 0; i < in_dims.size(); i++) { if (i != axis) out_dims.push_back(in_dims[i]); } - return phi::make_ddim(out_dims); + return common::make_ddim(out_dims); } template diff --git 
a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc index 4ff2c885c6930f..6b84fd1612e656 100644 --- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc @@ -79,11 +79,11 @@ class CPUUniformRandomKernel : public framework::OpKernel { tensor = selected_rows->mutable_value(); auto shape = ctx.Attr>("shape"); if (!new_shape.empty()) shape = new_shape; - tensor->Resize(phi::make_ddim(shape)); + tensor->Resize(common::make_ddim(shape)); selected_rows->mutable_rows()->reserve(shape[0]); } else if (out_var->IsType()) { tensor = out_var->GetMutable(); - if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); + if (!new_shape.empty()) tensor->Resize(common::make_ddim(new_shape)); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Expected type of Output(out) in uniform_random_op must be Tensor, " diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cu b/paddle/fluid/operators/uniform_random_batch_size_like_op.cu index 1bbd6eba3c662e..0cf50142c04a0d 100644 --- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cu +++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cu @@ -39,11 +39,11 @@ class GPUUniformRandomKernel : public framework::OpKernel { tensor = selected_rows->mutable_value(); auto shape = context.Attr>("shape"); if (!new_shape.empty()) shape = new_shape; - tensor->Resize(phi::make_ddim(shape)); + tensor->Resize(common::make_ddim(shape)); selected_rows->mutable_rows()->reserve(shape[0]); } else if (out_var->IsType()) { tensor = out_var->GetMutable(); - if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); + if (!new_shape.empty()) tensor->Resize(common::make_ddim(new_shape)); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Expected type of Output(out) in uniform_random_op must be " diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 16bce515f2a7fd..458794223dc743 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -75,7 +75,7 @@ inline std::vector GetNewDataFromShapeTensorList( auto tensor = list_new_shape_tensor[i]; PADDLE_ENFORCE_EQ( tensor->dims(), - phi::make_ddim({1}), + common::make_ddim({1}), platform::errors::InvalidArgument( "Shape of dim tensor in uniform_random_op should be [1]" "But received tensor's dim=%s.", diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 4d9b39d2dd262e..4d7a9eb5e49378 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -74,7 +74,7 @@ struct UniqueOpFunctor { if (count_ != nullptr) { // Resize the count tensor dims to allocate the memory - count_->Resize(phi::make_ddim({static_cast(uniq.size())})); + count_->Resize(common::make_ddim({static_cast(uniq.size())})); IndexT* count_data = count_->mutable_data(platform::CPUPlace()); // init count_data to 0 memset(count_data, 0, uniq.size() * sizeof(IndexT)); @@ -106,7 +106,7 @@ struct UniqueOpFunctor { } } - out_->Resize(phi::make_ddim({static_cast(uniq.size())})); + out_->Resize(common::make_ddim({static_cast(uniq.size())})); auto out_data = out_->mutable_data(platform::CPUPlace()); std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT)); } @@ -143,13 +143,13 @@ static void UniqueFlattendTensor(const framework::ExecutionContext& context, bool return_counts) { const InT* in_data = in.data(); std::set 
unique(in_data, in_data + in.numel()); - out->Resize(phi::make_ddim({static_cast(unique.size())})); + out->Resize(common::make_ddim({static_cast(unique.size())})); auto out_data = out->mutable_data(context.GetPlace()); std::copy(unique.begin(), unique.end(), out_data); if (return_index) { auto* indices = context.Output("Indices"); - indices->Resize(phi::make_ddim({out->numel()})); + indices->Resize(common::make_ddim({out->numel()})); auto indices_data = indices->mutable_data(context.GetPlace()); std::unordered_map indices_map; indices_map.reserve(out->numel()); @@ -164,7 +164,7 @@ static void UniqueFlattendTensor(const framework::ExecutionContext& context, if (return_inverse) { auto* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({in.numel()})); + inverse->Resize(common::make_ddim({in.numel()})); auto inverse_data = inverse->mutable_data(context.GetPlace()); std::unordered_map inverse_map; inverse_map.reserve(out->numel()); @@ -178,7 +178,7 @@ static void UniqueFlattendTensor(const framework::ExecutionContext& context, if (return_counts) { auto* count = context.Output("Counts"); - count->Resize(phi::make_ddim({out->numel()})); + count->Resize(common::make_ddim({out->numel()})); auto count_data = count->mutable_data(context.GetPlace()); std::unordered_map counts_map; counts_map.reserve(out->numel()); @@ -242,18 +242,18 @@ static void UniqueDim(const framework::ExecutionContext& context, std::iota(permute.begin(), permute.end(), 0); permute[axis] = 0; permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + std::vector in_trans_dims_vec(common::vectorize(in.dims())); in_trans_dims_vec[axis] = in.dims()[0]; in_trans_dims_vec[0] = in.dims()[axis]; phi::DenseTensor in_trans; - framework::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + framework::DDim in_trans_dims = common::make_ddim(in_trans_dims_vec); in_trans.Resize(in_trans_dims); in_trans.mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); phi::funcs::TransCompute( in.dims().size(), dev_ctx, in, &in_trans, permute); // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + framework::DDim in_trans_flat_dims = common::flatten_to_2d(in_trans_dims, 1); in_trans.Resize(in_trans_flat_dims); // sort indices @@ -308,10 +308,10 @@ static void UniqueDim(const framework::ExecutionContext& context, phi::DenseTensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = input_unbind.size(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + out_trans.Resize(common::make_ddim(out_trans_dims_vec)); out_trans.mutable_data(context.GetPlace()); std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); + out->Resize(common::make_ddim(out_trans_dims_vec)); out->mutable_data(context.GetPlace()); concat_functor(dev_ctx, input_unbind, 0, &out_trans); phi::funcs::TransCompute( diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index cc6a7d7dcf9248..cecd2e2931af62 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -30,7 +30,7 @@ inline std::vector GetDataFromTensorList( for (size_t i = 0; i < list_tensor.size(); ++i) { auto tensor = list_tensor[i]; PADDLE_ENFORCE_EQ(tensor->dims(), - phi::make_ddim({1}), + common::make_ddim({1}), platform::errors::InvalidArgument( "The shape of Tensor in list must be [1]. 
" "But received its shape " @@ -71,19 +71,19 @@ inline framework::DDim GetShape(const framework::ExecutionContext& ctx) { if (ctx.HasInput("ShapeTensor")) { auto* shape_tensor = ctx.Input("ShapeTensor"); auto vec_shape = phi::GetVectorFromTensor(shape_tensor); - return phi::make_ddim(vec_shape); + return common::make_ddim(vec_shape); } // 2. shape is a list/tuple containing Tensor auto shape_tensor_list = ctx.MultiInput("ShapeTensorList"); if (shape_tensor_list.size() > 0) { auto vec_shape = GetDataFromTensorList(shape_tensor_list); - return phi::make_ddim(vec_shape); + return common::make_ddim(vec_shape); } // 3. shape is a list/tuple without containing Tensor auto vec_shape = ctx.Attr>("shape"); - return phi::make_ddim(vec_shape); + return common::make_ddim(vec_shape); } template diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index f41cc2bab2021b..86e3fc3420ed63 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -171,8 +171,8 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { out_dims_vec.push_back(1); std::vector col_dims_vec{-1}; col_dims_vec.push_back(1); - ctx->SetOutputDim("Out", phi::make_ddim(out_dims_vec)); - ctx->SetOutputDim("Col", phi::make_ddim(col_dims_vec)); + ctx->SetOutputDim("Out", common::make_ddim(out_dims_vec)); + ctx->SetOutputDim("Col", common::make_ddim(col_dims_vec)); } } @@ -226,7 +226,7 @@ class CPUVarConv2dOPKernel : public framework::OpKernel { std::vector col_dims_vec{top_size}; col_dims_vec.push_back(1); auto* top_data = - col->mutable_data(phi::make_ddim(col_dims_vec), ctx.GetPlace()); + col->mutable_data(common::make_ddim(col_dims_vec), ctx.GetPlace()); auto* bottom_data = input.data(); int kernel_win_size = kernel_h * kernel_w; @@ -321,7 +321,7 @@ class CPUVarConv2dOPKernel : public framework::OpKernel { std::vector top_dims_vec{top_size}; top_dims_vec.push_back(1); auto* top_data = - top->mutable_data(phi::make_ddim(top_dims_vec), ctx.GetPlace()); + top->mutable_data(common::make_ddim(top_dims_vec), ctx.GetPlace()); auto* w_data = w->data(); auto* col_data = col->data(); diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt index 7cac45069a9e1d..e563831e96e61a 100644 --- a/paddle/fluid/pir/dialect/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -149,7 +149,7 @@ list( set(op_dialect_srcs ${op_dialect_srcs} ${op_source_file} ${api_source_file}) -set(op_dialect_deps phi pir type_info string_helper) +set(op_dialect_deps phi common pir type_info string_helper) cc_library( op_dialect diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc index d34e65e24b7eef..95e77ff6169c68 100644 --- a/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc @@ -13,12 +13,12 @@ // limitations under the License. 
#include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" #include "paddle/fluid/platform/init_phi.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/pir/core/ir_printer.h" REGISTER_FILE_SYMBOLS(kernel_dialect); @@ -46,7 +46,7 @@ void KernelDialect::PrintType(pir::Type type, std::ostream &os) const { os << phi::AllocationTypeStr(tensor_type.place().GetType()) << "_"; os << "tensor<"; - for (auto d : phi::vectorize(tensor_type.dims())) { + for (auto d : common::vectorize(tensor_type.dims())) { os << d; os << "x"; } @@ -58,7 +58,7 @@ void KernelDialect::PrintType(pir::Type type, std::ostream &os) const { os << phi::AllocationTypeStr(tensor_type.place().GetType()) << "_"; os << "tensor<"; - for (auto d : phi::vectorize(tensor_type.dims())) { + for (auto d : common::vectorize(tensor_type.dims())) { os << d; os << "x"; } diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py index 6c2165940c8e18..39c5d71616a220 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py @@ -395,7 +395,7 @@ def GenBuildOutputs( {name} = std::move(phi::IntArray(std::vector({name}_size, -1))); {name}.SetFromTensor(true); }} else if ({name}_.type().isa()) {{ - size_t {name}_size = phi::product({name}_.type().dyn_cast().dims()); + size_t {name}_size = common::product({name}_.type().dyn_cast().dims()); {name} = std::move(phi::IntArray(std::vector({name}_size, -1))); {name}.SetFromTensor(true); }} else {{ @@ -412,7 +412,7 @@ def GenBuildOutputs( size_t {name}_size = {name}_.type().dyn_cast().size(); {name} = std::vector({name}_size, -1); }} else if ({name}_.type().isa()) {{ - size_t {name}_size = phi::product({name}_.type().dyn_cast().dims()); + size_t {name}_size = common::product({name}_.type().dyn_cast().dims()); {name} = std::vector({name}_size, -1); }} else {{ PADDLE_THROW(phi::errors::Unimplemented("Only support VectorType or DenseTensorType")); diff --git a/paddle/fluid/pir/dialect/operator/ir/api_builder.cc b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc index af2268fda16a89..af00df8704136f 100644 --- a/paddle/fluid/pir/dialect/operator/ir/api_builder.cc +++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc @@ -13,7 +13,7 @@ // limitations under the License. 
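GetShape in utils.h above is the one spot where a shape can arrive three ways: a ShapeTensor input, a list of scalar shape tensors, or the plain shape attribute; all three funnel into common::make_ddim. A condensed stand-in for that resolution order (the real helper reads these vectors out of the ExecutionContext rather than taking them as parameters):

#include <cstdint>
#include <vector>

#include "paddle/common/ddim.h"

// Simplified mirror of GetShape's three-way fallback; empty vectors stand in
// for absent inputs.
common::DDim ResolveShape(const std::vector<int64_t> &from_shape_tensor,
                          const std::vector<int64_t> &from_tensor_list,
                          const std::vector<int64_t> &from_attr) {
  if (!from_shape_tensor.empty()) return common::make_ddim(from_shape_tensor);
  if (!from_tensor_list.empty()) return common::make_ddim(from_tensor_list);
  return common::make_ddim(from_attr);
}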
#include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/ir_context.h" namespace paddle { diff --git a/paddle/fluid/pir/dialect/operator/ir/api_builder.h b/paddle/fluid/pir/dialect/operator/ir/api_builder.h index 92b54c97db7f8c..7d1f1f6422b760 100644 --- a/paddle/fluid/pir/dialect/operator/ir/api_builder.h +++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.h @@ -16,8 +16,8 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/pir/core/builder.h" -#include "paddle/pir/core/macros.h" #include "paddle/pir/core/parameter.h" #include "paddle/pir/core/program.h" diff --git a/paddle/fluid/pir/dialect/operator/ir/attribute_storage.h b/paddle/fluid/pir/dialect/operator/ir/attribute_storage.h index 68f066b0093294..15e49e68e25833 100644 --- a/paddle/fluid/pir/dialect/operator/ir/attribute_storage.h +++ b/paddle/fluid/pir/dialect/operator/ir/attribute_storage.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/layout.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/pir/core/attribute.h" #include "paddle/pir/core/attribute_base.h" diff --git a/paddle/fluid/pir/dialect/operator/ir/ir_meta_tensor.h b/paddle/fluid/pir/dialect/operator/ir/ir_meta_tensor.h index ae3b82a5e3d0c1..f2af1eb66bf3c0 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ir_meta_tensor.h +++ b/paddle/fluid/pir/dialect/operator/ir/ir_meta_tensor.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/meta_tensor.h" namespace paddle { diff --git a/paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.cc b/paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.cc index d08dfccc25250d..384560ef591fce 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.cc +++ b/paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.cc @@ -13,8 +13,7 @@ // limitations under the License. 
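The dialect printers above iterate common::vectorize(dims) to render a DenseTensorType as tensor<2x3x...>, and op_build_gen.py emits the matching common::product call when it needs an element count for an IntArray. The printing loop reduces to this sketch (stream handling simplified, dtype left as a placeholder):

#include <sstream>
#include <string>

#include "paddle/common/ddim.h"

// Render a shape the way KernelDialect::PrintType does: each extent followed
// by 'x', so {2, 3} becomes "tensor<2x3xdtype>" once the element type lands.
std::string DimsToTypeString(const common::DDim &dims) {
  std::ostringstream os;
  os << "tensor<";
  for (auto d : common::vectorize(dims)) {
    os << d << "x";
  }
  os << "dtype>";  // placeholder; the real printer streams the element type
  return os.str();
}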
#include "paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.h" - -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" namespace paddle { namespace dialect { @@ -51,7 +50,7 @@ IrSelectedRows& IrSelectedRows::operator=(IrSelectedRows&& other) noexcept { return *this; } -int64_t IrSelectedRows::numel() const { return phi::product(dims_); } +int64_t IrSelectedRows::numel() const { return common::product(dims_); } const phi::Place& IrSelectedRows::place() const { IR_THROW("Don't use IrSelectedRows::place method."); diff --git a/paddle/fluid/pir/dialect/operator/ir/ir_tensor.cc b/paddle/fluid/pir/dialect/operator/ir/ir_tensor.cc index be06d3dbfafc52..6383257647323e 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ir_tensor.cc +++ b/paddle/fluid/pir/dialect/operator/ir/ir_tensor.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/ir_tensor.h" -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" namespace paddle { namespace dialect { @@ -51,7 +51,7 @@ IrTensor& IrTensor::operator=(IrTensor&& other) noexcept { return *this; } -int64_t IrTensor::numel() const { return phi::product(dims_); } +int64_t IrTensor::numel() const { return common::product(dims_); } const phi::Place& IrTensor::place() const { IR_THROW("Don't use IrTensor::place method."); diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index 5fdde6aadc08c0..cda564bedbb1df 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -2086,7 +2086,7 @@ void ExpandOp::Build(pir::Builder &builder, shape = std::move(phi::IntArray(std::vector(shape_size, -2))); shape.SetFromTensor(true); } else if (shape_.type().isa()) { - size_t shape_size = phi::product( + size_t shape_size = common::product( shape_.type().dyn_cast().dims()); // In ExpandInferMeta use -2 to represent the element in expand_shape is a // var. diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc index 69508f198b1102..4c44b91af35b72 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc @@ -88,7 +88,7 @@ void OperatorDialect::PrintType(pir::Type type, std::ostream &os) const { os << '.'; if (auto tensor_type = type.dyn_cast()) { os << "tensor<"; - for (auto d : phi::vectorize(tensor_type.dims())) { + for (auto d : common::vectorize(tensor_type.dims())) { os << d; os << "x"; } @@ -96,7 +96,7 @@ void OperatorDialect::PrintType(pir::Type type, std::ostream &os) const { os << ">"; } else if (auto selected_rows_type = type.dyn_cast()) { os << "selectedrows<"; - for (auto d : phi::vectorize(selected_rows_type.dims())) { + for (auto d : common::vectorize(selected_rows_type.dims())) { os << d; os << "x"; } @@ -153,7 +153,7 @@ pir::Type OperatorDialect::ParseType(pir::IrParser &parser) { // NOLINT break; } } - phi::DDim ddim = phi::make_ddim(dim); + phi::DDim ddim = common::make_ddim(dim); pir::Type dtype = parser.ParseType(); std::vector> lod; std::vector lodv; diff --git a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc index bf752a089b4f6f..7fc00acc12a81b 100644 --- a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc +++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc @@ -13,6 +13,7 @@ // limitations under the License. 
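IrTensor and IrSelectedRows above both define numel as common::product over the cached dims, and their unsupported accessors keep throwing through IR_THROW, which the include swaps suggest now travels with paddle/common/enforce.h. A pared-down mirror of that pair of behaviors (the class is a toy, reduced to the two members involved):

#include <cstdint>

#include "paddle/common/ddim.h"
#include "paddle/common/enforce.h"

// Toy stand-in for IrTensor: element count is just the product of the dims,
// and accessors that have no meaning in the IR fail loudly via IR_THROW.
class ToyIrTensor {
 public:
  explicit ToyIrTensor(common::DDim dims) : dims_(dims) {}

  int64_t numel() const { return common::product(dims_); }

  const common::DDim &dims() const { return dims_; }

  void place() const { IR_THROW("Don't use ToyIrTensor::place method."); }

 private:
  common::DDim dims_;
};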
#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/phi/core/enforce.h" namespace paddle { namespace dialect { diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.h b/paddle/fluid/pir/dialect/operator/utils/utils.h index 4bbd454d3ea350..18f9f2950c11fd 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.h +++ b/paddle/fluid/pir/dialect/operator/utils/utils.h @@ -18,6 +18,7 @@ #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/attribute.h" +#include "paddle/phi/core/enforce.h" #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_type.h" #include "paddle/pir/core/value.h" diff --git a/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc index 6adb25f8c2dd06..4c701a4c4a51c7 100644 --- a/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc @@ -17,10 +17,10 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/transforms/transform_general_functions.h" -#include "paddle/phi/core/ddim.h" namespace { diff --git a/paddle/fluid/pir/transforms/fusion/conv2d_add_fuse_pass.cc b/paddle/fluid/pir/transforms/fusion/conv2d_add_fuse_pass.cc index 35b0e65e16b5b5..037b2b95c6017c 100644 --- a/paddle/fluid/pir/transforms/fusion/conv2d_add_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/fusion/conv2d_add_fuse_pass.cc @@ -22,8 +22,8 @@ #include "paddle/fluid/pir/transforms/fusion/conv2d_add_fuse_pass.h" #include "paddle/fluid/pir/transforms/transform_general_functions.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/pir/drr/api/drr_pattern_base.h" -#include "paddle/phi/core/ddim.h" #include "paddle/pir/pass/pass.h" namespace { diff --git a/paddle/fluid/pir/transforms/fusion/conv2d_bn_fuse_pass.cc b/paddle/fluid/pir/transforms/fusion/conv2d_bn_fuse_pass.cc index 8406d705973031..42129852bc8bc3 100644 --- a/paddle/fluid/pir/transforms/fusion/conv2d_bn_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/fusion/conv2d_bn_fuse_pass.cc @@ -17,10 +17,10 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/transforms/fusion/conv2d_bn_fuse_pass.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/transforms/transform_general_functions.h" -#include "paddle/phi/core/ddim.h" namespace { @@ -60,7 +60,7 @@ class Conv2dBnFusePattern bn_variance.type().dyn_cast().dims(); float epsilon = op.attribute("epsilon").data(); paddle::dialect::FullOp full_op = rewriter.Build( - phi::vectorize(bn_variance_shape), epsilon); + common::vectorize(bn_variance_shape), epsilon); paddle::dialect::AddOp add_op = rewriter.Build( bn_variance.dyn_cast(), full_op.out()); paddle::dialect::SqrtOp sqrt_op = diff --git a/paddle/fluid/pir/transforms/params_sync_among_devices_pass.cc b/paddle/fluid/pir/transforms/params_sync_among_devices_pass.cc index 3e121aa51f0756..51b75ea0335821 100644 --- a/paddle/fluid/pir/transforms/params_sync_among_devices_pass.cc +++ b/paddle/fluid/pir/transforms/params_sync_among_devices_pass.cc @@ -23,7 +23,7 @@ #include 
"paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/pass/pass.h" diff --git a/paddle/fluid/pir/transforms/transform_general_functions.h b/paddle/fluid/pir/transforms/transform_general_functions.h index 48399a95a81ce6..c5f138daa41a79 100644 --- a/paddle/fluid/pir/transforms/transform_general_functions.h +++ b/paddle/fluid/pir/transforms/transform_general_functions.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/pir/core/operation.h" #include "paddle/pir/core/parameter.h" #include "paddle/pir/core/type.h" diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0cca954a6275a5..113ba40ec0cf31 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -6,9 +6,9 @@ cc_library( cc_test( errors_test SRCS errors_test.cc - DEPS phi enforce) + DEPS phi common enforce) -set(enforce_deps phi) +set(enforce_deps phi common) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() @@ -16,30 +16,30 @@ endif() cc_library( enforce INTERFACE SRCS enforce.cc - DEPS ${enforce_deps}) + DEPS ${enforce_deps} common) cc_library(monitor SRCS monitor.cc) cc_test( enforce_test SRCS enforce_test.cc - DEPS enforce) + DEPS enforce common) cc_test( cpu_info_test SRCS cpu_info_test.cc - DEPS phi) + DEPS phi common) cc_test( os_info_test SRCS os_info_test.cc - DEPS phi) + DEPS phi common) cc_library( place SRCS place.cc - DEPS enforce phi) + DEPS enforce phi common) cc_test( place_test SRCS place_test.cc - DEPS place glog phi) + DEPS place glog phi common) if(WITH_MKLDNN) set(MKLDNN_CTX_DEPS mkldnn) @@ -53,7 +53,7 @@ add_subdirectory(dynload) cc_library( cpu_helper SRCS cpu_helper.cc - DEPS cblas enforce) + DEPS cblas enforce common) cc_test( cpu_helper_test SRCS cpu_helper_test.cc @@ -82,13 +82,13 @@ if(WITH_GPU) nv_library( stream_callback_manager SRCS stream_callback_manager.cc - DEPS simple_threadpool enforce) + DEPS simple_threadpool enforce common) endif() if(WITH_ROCM) hip_library( stream_callback_manager SRCS stream_callback_manager.cc - DEPS simple_threadpool enforce) + DEPS simple_threadpool enforce common) endif() if(WITH_GPU OR WITH_ROCM) @@ -101,14 +101,14 @@ if(WITH_GLOO) cc_library( gloo_context SRCS gloo_context.cc - DEPS framework_proto gloo_wrapper enforce) + DEPS framework_proto gloo_wrapper enforce common) endif() # separate init from device_context to avoid cycle dependencies cc_library( init SRCS init.cc - DEPS device_context phi memcpy) + DEPS device_context phi common memcpy) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies @@ -130,12 +130,13 @@ cc_library( ${dgc_deps} dlpack phi + common ${XPU_CTX_DEPS}) cc_library( collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc - DEPS framework_proto device_context enforce) + DEPS framework_proto device_context enforce common) if(WITH_GPU OR WITH_ROCM) target_link_libraries(device_context gpu_resource_pool) @@ -159,7 +160,7 @@ set(DEVICE_EVENT_LIBS) cc_library( device_event_base SRCS device_event_base.cc - DEPS place enforce device_context op_registry) + DEPS place enforce device_context op_registry common) set(DEVICE_EVENT_LIBS 
device_event_base CACHE INTERNAL "device event libs") @@ -188,12 +189,12 @@ if(WITH_GPU) cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS ${DEVICE_EVENT_LIBS} device_event_custom_device device_context - allocator phi) + allocator phi common) else() nv_library( cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc - DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi) + DEPS ${DEVICE_EVENT_LIBS} device_context allocator phi common) endif() nv_test( device_context_test @@ -245,6 +246,7 @@ cc_library( lodtensor_printer SRCS lodtensor_printer.cc DEPS phi + common place tensor scope @@ -263,6 +265,7 @@ if(WITH_GPU) profiler SRCS profiler.cc profiler.cu DEPS phi + common gpu_info enforce dynload_cuda @@ -275,6 +278,7 @@ elseif(WITH_ROCM) profiler SRCS profiler.cc profiler.cu DEPS phi + common gpu_info enforce new_profiler @@ -286,6 +290,7 @@ elseif(WITH_XPU) profiler SRCS profiler.cc DEPS phi + common enforce dynload_xpti new_profiler @@ -296,7 +301,13 @@ else() cc_library( profiler SRCS profiler.cc - DEPS phi enforce new_profiler stats op_proto_maker shape_inference) + DEPS phi + common + enforce + new_profiler + stats + op_proto_maker + shape_inference) endif() cc_test( @@ -332,7 +343,7 @@ if(WITH_GPU) nv_test( test_limit_gpu_memory SRCS test_limit_gpu_memory.cu - DEPS gpu_info phi) + DEPS gpu_info phi common) nv_library( cuda_device_guard SRCS cuda_device_guard.cc @@ -347,7 +358,7 @@ if(WITH_ROCM) hip_test( test_limit_gpu_memory SRCS test_limit_gpu_memory.cu - DEPS gpu_info phi) + DEPS gpu_info phi common) hip_library( cuda_device_guard SRCS cuda_device_guard.cc @@ -359,7 +370,7 @@ if(NOT APPLE AND NOT WIN32) cc_test( device_code_test SRCS device_code_test.cc - DEPS phi lod_tensor) + DEPS phi common lod_tensor) endif() endif() @@ -381,4 +392,4 @@ cc_library( cc_test( init_phi_test SRCS init_phi_test.cc - DEPS phi init_phi) + DEPS phi common init_phi) diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu index cec83cbd11fe94..4e6bdb94625ddc 100644 --- a/paddle/fluid/platform/bfloat16_test.cu +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -59,7 +59,7 @@ TEST(bfloat16, lod_tensor_on_gpu) { phi::DenseTensor dst_tensor; bfloat16 *src_ptr = - src_tensor.mutable_data(phi::make_ddim({2, 2}), CPUPlace()); + src_tensor.mutable_data(common::make_ddim({2, 2}), CPUPlace()); bfloat16 arr[4] = { bfloat16(1.0f), bfloat16(0.5f), bfloat16(0.33333f), bfloat16(0.0f)}; diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h index 78f36a77e5f9cd..570ee01ec0a511 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.h +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -14,10 +14,10 @@ #pragma once +#include "paddle/common/macros.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/macros.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/custom/CMakeLists.txt b/paddle/fluid/platform/device/custom/CMakeLists.txt index 8e081781e298d6..c01bead7b03e66 100644 --- a/paddle/fluid/platform/device/custom/CMakeLists.txt +++ b/paddle/fluid/platform/device/custom/CMakeLists.txt @@ -2,9 +2,9 @@ if(WITH_CUSTOM_DEVICE) cc_library( custom_device_resource_pool SRCS custom_device_resource_pool.cc - DEPS phi glog enforce monitor) + DEPS phi common glog enforce monitor) cc_test( custom_device_test SRCS custom_device_test.cc - DEPS 
phi gradient_accumulator) + DEPS phi common gradient_accumulator) endif() diff --git a/paddle/fluid/platform/device/custom/custom_device_test.cc b/paddle/fluid/platform/device/custom/custom_device_test.cc index 7cb38b8850b7c6..b36355b2386be6 100644 --- a/paddle/fluid/platform/device/custom/custom_device_test.cc +++ b/paddle/fluid/platform/device/custom/custom_device_test.cc @@ -81,22 +81,22 @@ void TestTensorMutableData(const paddle::platform::Place& place) { float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = src_tensor.mutable_data(phi::make_ddim({1, 2, 3}), place); + p1 = src_tensor.mutable_data(common::make_ddim({1, 2, 3}), place); auto p1_holder = src_tensor.Holder(); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(phi::make_ddim({3, 1024}), place); + p2 = src_tensor.mutable_data(common::make_ddim({3, 1024}), place); auto p2_holder = src_tensor.Holder(); EXPECT_NE(p2, nullptr); EXPECT_NE(p1_holder.get(), p2_holder.get()); // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(phi::make_ddim({2, 2, 3}), place); + p1 = src_tensor.mutable_data(common::make_ddim({2, 2, 3}), place); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(phi::make_ddim({2, 2}), place); + p2 = src_tensor.mutable_data(common::make_ddim({2, 2}), place); EXPECT_EQ(p1, p2); } @@ -104,7 +104,7 @@ void TestTensorShareDataWith(const paddle::platform::Place& place) { std::cout << "TestTensorShareDataWith on " << place << std::endl; phi::DenseTensor src_tensor; phi::DenseTensor dst_tensor; - src_tensor.mutable_data(phi::make_ddim({2, 3, 4}), place); + src_tensor.mutable_data(common::make_ddim({2, 3, 4}), place); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -118,7 +118,7 @@ void TestTensorUtils(const paddle::platform::Place& place) { phi::DenseTensor gpu_tensor; phi::DenseTensor dst_tensor; - int* src_ptr = src_tensor.mutable_data(phi::make_ddim({3, 3}), + int* src_ptr = src_tensor.mutable_data(common::make_ddim({3, 3}), paddle::platform::CPUPlace()); std::array arr = {1, 2, 3, 4, 5, 6, 7, 8, 9}; diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 897f8d3732b730..65c3fb20631675 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -3,7 +3,13 @@ if(WITH_GPU) nv_library( gpu_info SRCS gpu_info.cc - DEPS phi glog enforce monitor dynload_cuda malloc) + DEPS phi + common + glog + enforce + monitor + dynload_cuda + malloc) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test( @@ -15,7 +21,7 @@ elseif(WITH_ROCM) hip_library( gpu_info SRCS gpu_info.cc - DEPS phi glog enforce monitor dynload_cuda) + DEPS phi common glog enforce monitor dynload_cuda) hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test( diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index 07901054b3b337..a535cd74478437 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,7 +1,7 @@ nv_library( cuda_profiler SRCS cuda_profiler.cc - DEPS enforce) + DEPS enforce common) nv_test( cudnn_helper_test diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt 
b/paddle/fluid/platform/device/ipu/CMakeLists.txt index a4a6db37837063..68bed1034af530 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -51,5 +51,5 @@ if(WITH_IPU) cc_library( ipu_info SRCS ${IPU_INFO_SRC} - DEPS popart-only enforce) + DEPS popart-only enforce common) endif() diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index ec6c8a49647b18..811f897cbda7b4 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -415,7 +415,7 @@ void Compiler::LowerConstants(const Scope* scope) { ConstantOpAttrVisitor visitor(tensor, dtype); auto value = op_desc->GetAttr("value"); paddle::visit(visitor, value); - auto ddim = phi::make_ddim(shape); + auto ddim = common::make_ddim(shape); tensor->Resize(ddim); auto const_data = std::unique_ptr(); diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index ee749f3a19a15d..d0792689228de1 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -210,7 +210,7 @@ void Executor::Run(const std::vector &inputs, } auto *tensor = outputs[i]; - tensor->Resize(phi::make_ddim(output_shape)); + tensor->Resize(common::make_ddim(output_shape)); auto fetch_dtype = fetch_info.dataType(); auto paddle_type = PopartDType2VarType(fetch_dtype); tensor->mutable_data(ctx.GetPlace(), @@ -427,7 +427,7 @@ void Executor::RunPopef(const std::vector &inputs, auto *tensor = outputs[i]; // resize output size to make data_ptr valid. - tensor->Resize(phi::make_ddim(output_shape)); + tensor->Resize(common::make_ddim(output_shape)); tensor->mutable_data(ctx.GetPlace(), framework::TransToPhiDataType(paddle_dtype)); diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index f9e9659fa9f4cc..6a61d750b501d9 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -20,6 +20,7 @@ cc_library( device_context place phi + common dynload_xpti) cc_library( xpu_op_list @@ -30,6 +31,7 @@ cc_library( device_context op_kernel_type phi + common dynload_xpti) cc_library( xpu_resource_pool diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index 6b58453f03ea83..d72722de96ae04 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -60,8 +60,8 @@ TEST(DeviceCode, cuda) { phi::DenseTensor cpu_z; float scale = 2; - auto dims = - phi::make_ddim({static_cast(256), static_cast(1024)}); + auto dims = common::make_ddim( + {static_cast(256), static_cast(1024)}); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); auto* cpu_ctx = reinterpret_cast(pool.Get(phi::CPUPlace())); cpu_x.Resize(dims); diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 4cb3bfdb3adaef..29f7b91a171572 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,7 +1,7 @@ cc_library( dynamic_loader SRCS dynamic_loader.cc - DEPS glog enforce phi) + DEPS glog enforce phi common) list( APPEND @@ -57,20 +57,20 @@ if(WITH_ROCM) hip_library( dynload_cuda SRCS ${HIP_SRCS} - DEPS dynamic_loader phi) + DEPS dynamic_loader phi common) cc_library( dynload_warpctc SRCS warpctc.cc - DEPS dynamic_loader warpctc 
phi) + DEPS dynamic_loader warpctc phi common) else() nv_library( dynload_cuda SRCS ${CUDA_SRCS} - DEPS dynamic_loader phi) + DEPS dynamic_loader phi common) cc_library( dynload_warpctc SRCS warpctc.cc - DEPS dynamic_loader warpctc phi) + DEPS dynamic_loader warpctc phi common) endif() if(WITH_XPU) cc_library( @@ -86,6 +86,6 @@ if(MKL_FOUND AND WITH_ONEMKL) cc_library( dynload_mklrt SRCS mklrt.cc - DEPS dynamic_loader phi) + DEPS dynamic_loader phi common) target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index d9c9398461d5c7..1a82b05f3bc3af 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -103,10 +103,6 @@ limitations under the License. */ #endif #include "paddle/phi/core/flags.h" -namespace phi { -class ErrorSummary; -} // namespace phi - PHI_DECLARE_int32(call_stack_level); namespace paddle { diff --git a/paddle/fluid/platform/errors.h b/paddle/fluid/platform/errors.h index 758af3e2d9137e..b13a8b8d7a7129 100644 --- a/paddle/fluid/platform/errors.h +++ b/paddle/fluid/platform/errors.h @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" namespace paddle { namespace platform { -namespace errors = ::phi::errors; -using error = ::phi::ErrorCode; +namespace errors = ::common::errors; +using error = ::common::ErrorCode; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 3297b3a2326dac..4575b54d48c9bf 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -320,7 +320,7 @@ TEST(float16, lod_tensor_on_gpu) { phi::DenseTensor dst_tensor; float16 *src_ptr = - src_tensor.mutable_data(phi::make_ddim({2, 2}), CPUPlace()); + src_tensor.mutable_data(common::make_ddim({2, 2}), CPUPlace()); float16 arr[4] = { float16(1.0f), float16(0.5f), float16(0.33333f), float16(0.0f)}; diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h index 3f854d40b8b23c..33ccc87fe32893 100644 --- a/paddle/fluid/platform/macros.h +++ b/paddle/fluid/platform/macros.h @@ -14,4 +14,4 @@ limitations under the License. 
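// [Sketch, not part of the patch] The platform/errors.h hunk above only
// retargets the long-standing aliases, so code spelled through
// platform::errors keeps compiling and now resolves into ::common instead of
// ::phi. Assuming InvalidArgument keeps its printf-style signature after the
// move (CheckPositive is a hypothetical caller):

#include "paddle/fluid/platform/errors.h"

void CheckPositive(int v) {
  if (v <= 0) {
    // Resolves to ::common::errors::InvalidArgument after this patch,
    // ::phi::errors::InvalidArgument before it.
    auto err = paddle::platform::errors::InvalidArgument(
        "expected a positive value, got %d", v);
    (void)err;  // typically handed to PADDLE_THROW / PADDLE_ENFORCE
  }
}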
*/ #pragma once -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h index bfbbcf3db77be5..bcc5dba0b5732d 100644 --- a/paddle/fluid/platform/monitor.h +++ b/paddle/fluid/platform/monitor.h @@ -26,7 +26,7 @@ #include "glog/logging.h" -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 85eba90ec6166f..73fc8b79b4e4ef 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,24 +1,24 @@ cc_library( host_tracer SRCS host_tracer.cc - DEPS framework_proto enforce phi var_type_traits) + DEPS framework_proto enforce phi common var_type_traits) cc_library( cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc - DEPS workqueue_utils enforce glog) + DEPS workqueue_utils enforce glog common) cc_library( xpu_tracer SRCS xpu_tracer.cc - DEPS enforce glog) + DEPS enforce glog common) add_subdirectory(custom_device) cc_library( event_node SRCS event_node.cc - DEPS enforce place) + DEPS enforce place common) cc_library( profiler_utils SRCS utils.cc - DEPS enforce glog) + DEPS enforce glog common) add_subdirectory(dump) cc_library( profiler_logger @@ -32,7 +32,7 @@ cc_library( cc_library( cpu_utilization SRCS cpu_utilization.cc - DEPS phi enforce glog) + DEPS phi common enforce glog common) cc_library( new_profiler SRCS profiler.cc diff --git a/paddle/fluid/platform/profiler/custom_device/CMakeLists.txt b/paddle/fluid/platform/profiler/custom_device/CMakeLists.txt index f4fe05d0e7de98..ece3e7466f0550 100644 --- a/paddle/fluid/platform/profiler/custom_device/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/custom_device/CMakeLists.txt @@ -1,4 +1,4 @@ cc_library( custom_tracer SRCS custom_tracer.cc - DEPS workqueue_utils enforce glog) + DEPS workqueue_utils enforce glog common) diff --git a/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py b/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py index 5f439485eb1bc4..378f57a468cd46 100644 --- a/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py +++ b/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py @@ -27,7 +27,7 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" #include "paddle/utils/test_macros.h" """ @@ -152,7 +152,7 @@ class TEST_API EagerTensorOperants : public TensorOperantsBase { #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" #include "paddle/utils/test_macros.h" """ diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 6e12d6fa464cc7..767d0a653c9265 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -20,12 +20,12 @@ #include +#include "paddle/common/ddim.h" #include "paddle/fluid/prim/api/all.h" #include "paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h" #include "paddle/fluid/prim/api/generated_prim/prim_generated_api.h" #include 
"paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace prim { @@ -36,12 +36,12 @@ using IntArray = paddle::experimental::IntArrayBase; template void hardswish_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { - auto offset = full(phi::vectorize(x.dims()), 3.0, x.dtype()); + auto offset = full(common::vectorize(x.dims()), 3.0, x.dtype()); auto condition = less_equal(x, offset); auto tmp1 = where(condition, out_grad * ((x / 3.0) + 0.5), out_grad); auto res = where( - less_than(x, full(phi::vectorize(x.dims()), -3.0, x.dtype())), - full(phi::vectorize(x.dims()), 0.0, x.dtype()), + less_than(x, full(common::vectorize(x.dims()), -3.0, x.dtype())), + full(common::vectorize(x.dims()), 0.0, x.dtype()), tmp1); set_output(res, x_grad); } @@ -54,7 +54,7 @@ void leaky_relu_grad(const Tensor& out, Tensor* x_grad) { if (x_grad) { auto condition = greater_than( - out, full(phi::vectorize(out.dims()), 0.0, out.dtype())); + out, full(common::vectorize(out.dims()), 0.0, out.dtype())); auto res = where(condition, out_grad, out_grad * negative_slope); set_output(res, x_grad); } @@ -88,10 +88,11 @@ template void relu_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { auto condition = greater_than( - out, full(phi::vectorize(out.dims()), 0.0, out.dtype())); - auto res = where(condition, - out_grad, - full(phi::vectorize(out.dims()), 0.0, out.dtype())); + out, full(common::vectorize(out.dims()), 0.0, out.dtype())); + auto res = + where(condition, + out_grad, + full(common::vectorize(out.dims()), 0.0, out.dtype())); set_output(res, x_grad); } } @@ -119,7 +120,7 @@ void softmax_grad(const Tensor& out, } } else { set_output( - full(phi::vectorize(out_grad.dims()), 0.0, out_grad.dtype()), + full(common::vectorize(out_grad.dims()), 0.0, out_grad.dtype()), x_grad); } } @@ -139,7 +140,7 @@ void gather_grad(const Tensor& x, const Tensor& out_grad, const Scalar& axis, Tensor* grad_x) { - auto zero_tensor = full(phi::vectorize(x.dims()), 0.0, x.dtype()); + auto zero_tensor = full(common::vectorize(x.dims()), 0.0, x.dtype()); std::vector tmp_perm; // change axis to rank 0 @@ -189,7 +190,7 @@ void tanh_grad(const Tensor& out, const Tensor& grad_out, Tensor* grad_x) { template void reshape_grad(const Tensor& x, const Tensor& grad_out, Tensor* grad_x) { if (grad_x) { - auto grad_x_tmp = reshape(grad_out, phi::vectorize(x.dims())); + auto grad_x_tmp = reshape(grad_out, common::vectorize(x.dims())); set_output(grad_x_tmp, grad_x); } } @@ -229,8 +230,8 @@ void subtract_grad(const Tensor& x, by_pass(scale_out_grad, dy); } else { auto dy_reduce_res = - scale_out_grad.sum(phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + scale_out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, dy); } } else { @@ -245,8 +246,8 @@ void subtract_grad(const Tensor& x, by_pass(out_grad, dx); } else { auto dx_reduce_res = - out_grad.sum(phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, dx); } } else { @@ -270,8 +271,8 @@ void add_grad(const Tensor& x, by_pass(out_grad, dy); } else { auto dy_reduce_res = - 
out_grad.sum(phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, dy); } @@ -287,8 +288,8 @@ void add_grad(const Tensor& x, by_pass(out_grad, dx); } else { auto dx_reduce_res = - out_grad.sum(phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, dx); } } else { @@ -307,7 +308,7 @@ void sum_grad(const Tensor& x, if (!x_grad) { return; } - std::vector x_dim = phi::vectorize(x.dims()); + std::vector x_dim = common::vectorize(x.dims()); int64_t axis_size = axis.size(); int64_t x_dim_size = x_dim.size(); reduce_all = false; @@ -363,8 +364,8 @@ void divide_grad(const Tensor& x, set_output(dy_res, dy); } else { auto dy_reduce_res = - dy_res.sum(phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, dy); } } else { @@ -373,7 +374,7 @@ void divide_grad(const Tensor& x, } // indicate we will compute dy if (dx) { // dx = (1/y) * dout - auto one_tensor = full(phi::vectorize(y.dims()), 1.0, y.dtype()); + auto one_tensor = full(common::vectorize(y.dims()), 1.0, y.dtype()); auto dx_res = one_tensor / y * out_grad; if (y.dims() != x.dims()) { // Maybe need reduce here @@ -382,8 +383,8 @@ void divide_grad(const Tensor& x, set_output(dx_res, dx); } else { auto dx_reduce_res = - dx_res.sum(phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, dx); } @@ -411,8 +412,8 @@ void elementwise_pow_grad(const Tensor& x, set_output(dy_res, dy); } else { auto dy_reduce_res = - dy_res.sum(phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, dy); } } else { @@ -431,8 +432,8 @@ void elementwise_pow_grad(const Tensor& x, set_output(dx_res, dx); } else { auto dx_reduce_res = - dx_res.sum(phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, dx); } @@ -455,7 +456,7 @@ template void floor_grad(const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { auto zero_tensor = - full(phi::vectorize(out_grad.dims()), 0.0, out_grad.dtype()); + full(common::vectorize(out_grad.dims()), 0.0, out_grad.dtype()); set_output(zero_tensor, x_grad); } } @@ -498,7 +499,7 @@ void multiply_grad(const Tensor& x, set_output(x_grad_unreduce, x_grad); } else { auto x_grad_reduced = x_grad_unreduce.sum( - phi::vectorize(axes), x_grad_unreduce.dtype(), false); + common::vectorize(axes), x_grad_unreduce.dtype(), false); if (x_grad_reduced.dims().size() != x.dims().size()) { x_grad_reduced = reshape(x_grad_reduced, 
x.shape()); } @@ -516,7 +517,7 @@ void multiply_grad(const Tensor& x, set_output(y_grad_unreduce, y_grad); } else { auto y_grad_reduced = y_grad_unreduce.sum( - phi::vectorize(axes), y_grad_unreduce.dtype(), false); + common::vectorize(axes), y_grad_unreduce.dtype(), false); if (y_grad_reduced.dims().size() != y.dims().size()) { y_grad_reduced = reshape(y_grad_reduced, y.shape()); } @@ -534,13 +535,13 @@ void expand_grad(const Tensor& x, const IntArray& shape, Tensor* x_grad) { if (x_grad) { - auto out_dims = phi::make_ddim(shape.GetData()); + auto out_dims = common::make_ddim(shape.GetData()); if (out_dims != x.dims()) { auto axes = get_reduce_dims(x.dims(), out_dims); if (!axes.size()) { by_pass(out_grad, x_grad); } else { - auto reduced = out_grad.sum(phi::vectorize(axes), x.dtype(), false); + auto reduced = out_grad.sum(common::vectorize(axes), x.dtype(), false); if (reduced.dims().size() != x.dims().size()) { reduced = reshape(reduced, x.shape()); } @@ -609,7 +610,7 @@ void slice_grad(const Tensor& input, if (decrease_size > 0) { if (decrease_size == static_cast(in_dims.size())) { // all dims decrease - out_dims = phi::make_ddim(std::vector(decrease_size, 1)); + out_dims = common::make_ddim(std::vector(decrease_size, 1)); } else { origin_out_shape.resize(out_dims.size() + decrease_size, -1); for (size_t i = 0; i < decrease_size; ++i) { @@ -623,7 +624,7 @@ void slice_grad(const Tensor& input, ++index; } } - out_dims = phi::make_ddim(origin_out_shape); + out_dims = common::make_ddim(origin_out_shape); } } @@ -705,7 +706,7 @@ void group_norm_grad(const Tensor& x, // // cal d_bias: // d_bias = sum(dy, axes=(0,2,3)) - DataLayout data_layout_ = phi::StringToDataLayout(data_layout); + DataLayout data_layout_ = common::StringToDataLayout(data_layout); if (data_layout_ != DataLayout::kNCHW) { PADDLE_THROW(phi::errors::InvalidArgument("Unsupported storage order: %s", data_layout)); @@ -723,7 +724,7 @@ void group_norm_grad(const Tensor& x, out_grad_data = cast(out_grad, phi::DataType::FLOAT32); } - std::vector x_dims = phi::vectorize(x.dims()); + std::vector x_dims = common::vectorize(x.dims()); auto add_axis = std::vector({-1}); const int N = x_dims[0]; const int C = x_dims[1]; @@ -881,7 +882,7 @@ void layer_norm_grad(const Tensor& x, auto d_mean_d_std = (1.0 / shape_2) * (d_mean + d_std); auto x_grad_tmp = dx_end - d_mean_d_std; - x_grad_tmp = reshape(x_grad_tmp, phi::vectorize(x.dims())); + x_grad_tmp = reshape(x_grad_tmp, common::vectorize(x.dims())); if (x.dtype() == phi::DataType::FLOAT16 || x.dtype() == phi::DataType::BFLOAT16) { @@ -962,7 +963,7 @@ void topk_grad(const Tensor& x, by_pass(out_grad, x_grad); return; } - auto zero_tensor = full(phi::vectorize(x.dims()), 0, x.dtype()); + auto zero_tensor = full(common::vectorize(x.dims()), 0, x.dtype()); auto x_grad_tmp = put_along_axis(zero_tensor, indices, out_grad, axis); set_output(x_grad_tmp, x_grad); } @@ -974,7 +975,7 @@ void gather_nd_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { - auto zero_tensor = full(phi::vectorize(x.dims()), 0.0, x.dtype()); + auto zero_tensor = full(common::vectorize(x.dims()), 0.0, x.dtype()); auto x_grad_tmp = scatter_nd_add(zero_tensor, index, out_grad); set_output(x_grad_tmp, x_grad); } @@ -989,7 +990,7 @@ void prod_grad(const Tensor& x, bool reduce_all, Tensor* x_grad) { if (x_grad) { - std::vector x_dim = phi::vectorize(x.dims()); + std::vector x_dim = common::vectorize(x.dims()); int64_t axis_size = axis.size(); int64_t x_dim_size = x_dim.size(); reduce_all = false; @@ 
-1044,8 +1045,8 @@ void max_grad(const Tensor& x, if (!x_grad) { return; } - auto zero_tensor = full(phi::vectorize(x.dims()), 0.0, x.dtype()); - std::vector x_dim = phi::vectorize(x.dims()); + auto zero_tensor = full(common::vectorize(x.dims()), 0.0, x.dtype()); + std::vector x_dim = common::vectorize(x.dims()); int64_t axis_size = axis.size(); int64_t x_dim_size = x_dim.size(); reduce_all = false; @@ -1095,8 +1096,9 @@ void assign_grad(const Tensor& out_grad, Tensor* x_grad) { template void erf_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { - auto m_2_sqrt_pi = full(phi::vectorize(x.dims()), M_2_SQRTPI, x.dtype()); - auto neg_one = full(phi::vectorize(x.dims()), -1.0, x.dtype()); + auto m_2_sqrt_pi = + full(common::vectorize(x.dims()), M_2_SQRTPI, x.dtype()); + auto neg_one = full(common::vectorize(x.dims()), -1.0, x.dtype()); auto neg_tmp = neg_one * x * x; auto mul_tmp = m_2_sqrt_pi * exp(neg_tmp); set_output(out_grad * mul_tmp, x_grad); @@ -1119,8 +1121,8 @@ void maximum_grad(const Tensor& x, set_output(dx_res, x_grad); } else { auto dx_reduce_res = - dx_res.sum(phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, x_grad); } } else { @@ -1138,8 +1140,8 @@ void maximum_grad(const Tensor& x, set_output(dy_res, y_grad); } else { auto dy_reduce_res = - dy_res.sum(phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, y_grad); } } else { @@ -1198,7 +1200,7 @@ void scatter_grad(const Tensor& index, Tensor* updates_grad) { if (x_grad) { auto zero_tensor = - full(phi::vectorize(updates.dims()), 0.0, updates.dtype()); + full(common::vectorize(updates.dims()), 0.0, updates.dtype()); auto tmp_grad = scatter(out_grad, index, zero_tensor, false); set_output(tmp_grad, x_grad); } @@ -1231,7 +1233,7 @@ void batch_norm_grad(const Tensor& x, Tensor* bias_grad) { use_global_stats = is_test || use_global_stats; - DataLayout data_layout_ = phi::StringToDataLayout(data_layout); + DataLayout data_layout_ = common::StringToDataLayout(data_layout); Tensor x_data = x; Tensor out_grad_data = out_grad; @@ -1268,7 +1270,7 @@ void batch_norm_grad(const Tensor& x, if (use_global_stats) { auto eps = - full(phi::vectorize(run_var.dims()), epsilon, run_var.dtype()); + full(common::vectorize(run_var.dims()), epsilon, run_var.dtype()); mean_data = run_mean; rsqrt_var = (run_var + eps).pow(-0.5); } else { @@ -1573,8 +1575,8 @@ void minimum_grad(const Tensor& x, set_output(dx_res, x_grad); } else { auto dx_reduce_res = - dx_res.sum(phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, x_grad); } } else { @@ -1592,8 +1594,8 @@ void minimum_grad(const Tensor& x, set_output(dy_res, y_grad); } else { auto dy_reduce_res = - dy_res.sum(phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); 
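// [Sketch, not part of the patch] The backward rules in this header repeat
// the same broadcast-gradient idiom that these hunks rename: sum the incoming
// gradient over the broadcast axes, then reshape it back to the operand's
// shape. Assuming the templated prim API (reshape<T>, set_output<T>) seen
// elsewhere in this file, the recurring pattern factors out as:

template <typename T>
void ReduceGradToOperand(const Tensor& grad,
                         const Tensor& operand,
                         const phi::DDim& reduce_dim,
                         Tensor* out) {
  // Same call shape as the hunks above; only the vectorize() namespace
  // changes in this commit.
  auto reduced =
      grad.sum(common::vectorize(reduce_dim), operand.dtype(), false);
  auto restored = reshape<T>(reduced, common::vectorize(operand.dims()));
  set_output<T>(restored, out);
}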
set_output(dy_tmp, y_grad); } } else { @@ -1609,13 +1611,13 @@ void tile_grad(const Tensor& x, Tensor* x_grad) { if (x_grad) { auto repeat_times_data = repeat_times.GetData(); - auto out_grad_shape = phi::vectorize(out_grad.dims()); + auto out_grad_shape = common::vectorize(out_grad.dims()); auto result = out_grad; for (int i = 0; i < static_cast(repeat_times_data.size()); i++) { int size = out_grad_shape[i] / repeat_times_data[i]; std::vector sections(repeat_times_data[i], size); auto split_arr = split(result, IntArray(sections), i); - result = full(phi::vectorize(split_arr[0].dims()), 0.0, x.dtype()); + result = full(common::vectorize(split_arr[0].dims()), 0.0, x.dtype()); for (int j = 0; j < static_cast(split_arr.size()); j++) { result = split_arr[j] + result; } diff --git a/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h index e20e4a965c9939..1bb91d977cd1e2 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h @@ -20,11 +20,11 @@ #include +#include "paddle/common/ddim.h" #include "paddle/fluid/prim/api/all.h" #include "paddle/fluid/prim/api/generated_prim/prim_generated_api.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace prim { @@ -135,9 +135,9 @@ void matmul_double_grad(const Tensor& x, Tensor* y_grad, Tensor* grad_out_grad) { // Get dims from the input x, y, output_grad - std::vector x_dims = vectorize(x.dims()); - std::vector y_dims = vectorize(y.dims()); - std::vector grad_out_dims = vectorize(grad_out.dims()); + std::vector x_dims = common::vectorize(x.dims()); + std::vector y_dims = common::vectorize(y.dims()); + std::vector grad_out_dims = common::vectorize(grad_out.dims()); int x_ndim = x_dims.size(); int y_ndim = y_dims.size(); @@ -384,12 +384,13 @@ void matmul_double_grad(const Tensor& x, } // recover the original dim of output (delete 1) - std::vector dx_dims = - dx.initialized() ? vectorize(dx.dims()) : std::vector({}); - std::vector dy_dims = - dy.initialized() ? vectorize(dy.dims()) : std::vector({}); - std::vector ddout_dims = - ddout.initialized() ? vectorize(ddout.dims()) : std::vector({}); + std::vector dx_dims = dx.initialized() ? common::vectorize(dx.dims()) + : std::vector({}); + std::vector dy_dims = dy.initialized() ? common::vectorize(dy.dims()) + : std::vector({}); + std::vector ddout_dims = ddout.initialized() + ? 
common::vectorize(ddout.dims()) + : std::vector({}); if (x_ndim == 1 && y_ndim == 1) { if (dx.initialized() && dx_dims[0] == 1) { dx = reshape(dx, IntArray(x_dims)); @@ -470,7 +471,7 @@ void multiply_double_grad(const Tensor& x, if (!axes.size()) { set_output(dx, x_grad); } else { - auto dx_reduce = dx.sum(phi::vectorize(axes), dx.dtype(), false); + auto dx_reduce = dx.sum(common::vectorize(axes), dx.dtype(), false); if (dx_reduce.dims().size() != x.dims().size()) { dx_reduce = reshape(dx_reduce, x.shape()); } @@ -481,7 +482,7 @@ void multiply_double_grad(const Tensor& x, } } else { - auto dx = full(phi::vectorize(x.dims()), 0.0, x.dtype()); + auto dx = full(common::vectorize(x.dims()), 0.0, x.dtype()); set_output(dx, x_grad); } } @@ -493,7 +494,7 @@ void multiply_double_grad(const Tensor& x, if (!axes.size()) { set_output(dy, y_grad); } else { - auto dy_reduce = dy.sum(phi::vectorize(axes), dy.dtype(), false); + auto dy_reduce = dy.sum(common::vectorize(axes), dy.dtype(), false); if (dy_reduce.dims().size() != y.dims().size()) { dy_reduce = reshape(dy_reduce, y.shape()); } @@ -503,7 +504,7 @@ void multiply_double_grad(const Tensor& x, set_output(dy, y_grad); } } else { - auto dy = full(phi::vectorize(y.dims()), 0.0, y.dtype()); + auto dy = full(common::vectorize(y.dims()), 0.0, y.dtype()); set_output(dy, y_grad); } } @@ -516,7 +517,8 @@ void multiply_double_grad(const Tensor& x, } else if (grad_y_grad) { ddout = grad_y_grad.get() * x; } else { - ddout = full(phi::vectorize(grad_out.dims()), 0.0, grad_out.dtype()); + ddout = + full(common::vectorize(grad_out.dims()), 0.0, grad_out.dtype()); } set_output(ddout, grad_out_grad); } @@ -531,7 +533,7 @@ void add_double_grad(const Tensor& y, Tensor* grad_out_grad) { if (grad_out_grad) { // ddout = ddx + ddy - Tensor ddout = full(phi::vectorize(grad_out.dims()), 0.0, y.dtype()); + Tensor ddout = full(common::vectorize(grad_out.dims()), 0.0, y.dtype()); if (!grad_x_grad && !grad_y_grad) { set_output(ddout, grad_out_grad); } else { @@ -563,9 +565,9 @@ void add_triple_grad(const paddle::optional& grad_grad_x, by_pass(grad_grad_out_grad, grad_grad_y_grad); } else { auto dddy_reduce_res = grad_grad_out_grad.sum( - phi::vectorize(reduce_dim), grad_grad_y.get().dtype(), false); - auto dddy_tmp = reshape(dddy_reduce_res, - phi::vectorize(grad_grad_y.get().dims())); + common::vectorize(reduce_dim), grad_grad_y.get().dtype(), false); + auto dddy_tmp = reshape( + dddy_reduce_res, common::vectorize(grad_grad_y.get().dims())); set_output(dddy_tmp, grad_grad_y_grad); } } else { @@ -585,9 +587,9 @@ void add_triple_grad(const paddle::optional& grad_grad_x, by_pass(grad_grad_out_grad, grad_grad_x_grad); } else { auto dddx_reduce_res = grad_grad_out_grad.sum( - phi::vectorize(reduce_dim), grad_grad_x.get().dtype(), false); - auto dddx_tmp = reshape(dddx_reduce_res, - phi::vectorize(grad_grad_x.get().dims())); + common::vectorize(reduce_dim), grad_grad_x.get().dtype(), false); + auto dddx_tmp = reshape( + dddx_reduce_res, common::vectorize(grad_grad_x.get().dims())); set_output(dddx_tmp, grad_grad_x_grad); } } else { @@ -611,7 +613,8 @@ void subtract_double_grad(const Tensor& y, if (!grad_x_grad && !grad_y_grad) { grad_out_grad = nullptr; } else { - Tensor ddout = full(phi::vectorize(grad_out.dims()), 0.0, y.dtype()); + Tensor ddout = + full(common::vectorize(grad_out.dims()), 0.0, y.dtype()); if (grad_x_grad) { ddout = ddout + grad_x_grad.get(); } diff --git a/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc 
b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc index f89a898ca1a58e..2f76e8bbd966f0 100644 --- a/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc +++ b/paddle/fluid/prim/api/manual_prim/utils/static_utils.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/common/macros.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_desc.h" @@ -21,7 +22,6 @@ #include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/fluid/prim/utils/static/static_global_utils.h" #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/core/utils/data_type.h" namespace paddle { namespace prim { diff --git a/paddle/fluid/prim/api/manual_prim/utils/utils.h b/paddle/fluid/prim/api/manual_prim/utils/utils.h index d37a50c21a8e7b..90a25f8bf1e1fd 100644 --- a/paddle/fluid/prim/api/manual_prim/utils/utils.h +++ b/paddle/fluid/prim/api/manual_prim/utils/utils.h @@ -15,13 +15,13 @@ #pragma once #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/prim/api/generated_prim/prim_generated_api.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { @@ -72,7 +72,7 @@ static phi::DDim get_reduce_dims_from_out(const phi::DDim& dout_dims, i)); } } - return phi::make_ddim(result); + return common::make_ddim(result); } static phi::DDim get_reduce_dims(const phi::DDim& x_dims, @@ -91,7 +91,7 @@ static std::vector get_reduce_dims(const Tensor& dx, if (dout_ndim < x_ndim) { return std::vector({}); } - const std::vector dx_dims = phi::vectorize(dx.dims()); + const std::vector dx_dims = common::vectorize(dx.dims()); std::vector broadcast_dims(dout_ndim); std::fill( broadcast_dims.data(), broadcast_dims.data() + dout_ndim - x_ndim, 1); diff --git a/paddle/fluid/prim/utils/static/desc_tensor.h b/paddle/fluid/prim/utils/static/desc_tensor.h index 7d8c939fec122f..cf4db764bb5ca5 100644 --- a/paddle/fluid/prim/utils/static/desc_tensor.h +++ b/paddle/fluid/prim/utils/static/desc_tensor.h @@ -13,9 +13,9 @@ // limitations under the License. 
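// [Sketch, not part of the patch] get_reduce_dims_from_out, touched in the
// manual_prim utils.h hunk above, packs the axis indices over which a
// broadcast must be undone into a DDim; only its final make_ddim call changes
// namespace here. Assuming NumPy-style broadcast rules, a worked example:

phi::DDim dout_dims = common::make_ddim({2, 3, 4, 5});
phi::DDim x_dims = common::make_ddim({3, 1, 5});
// x was broadcast along the prepended axis 0 and its size-1 axis (axis 2 of
// dout), so the helper is expected to yield the axis indices {0, 2}:
phi::DDim axes = get_reduce_dims_from_out(dout_dims, x_dims);
// expected: common::vectorize(axes) == {0, 2}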
#pragma once +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/var_desc.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/extended_tensor.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/utils/any.h" @@ -27,7 +27,7 @@ class DescTensor : public phi::ExtendedTensor, public phi::TypeInfoTraits { public: explicit DescTensor(framework::VarDesc* desc) - : desc_ptr_(desc), dims_(phi::make_ddim(desc->GetShape())) {} + : desc_ptr_(desc), dims_(common::make_ddim(desc->GetShape())) {} static const char* name() { return "DescTensor"; } std::string Name() const { return desc_ptr_->Name(); } @@ -35,7 +35,7 @@ class DescTensor : public phi::ExtendedTensor, std::vector shape() const { return desc_ptr_->GetShape(); } const phi::DDim& dims() const override { - dims_ = phi::make_ddim(desc_ptr_->GetShape()); + dims_ = common::make_ddim(desc_ptr_->GetShape()); return dims_; } diff --git a/paddle/fluid/primitive/backend/CMakeLists.txt b/paddle/fluid/primitive/backend/CMakeLists.txt index ec3d39c8739c10..3e857778a4f144 100644 --- a/paddle/fluid/primitive/backend/CMakeLists.txt +++ b/paddle/fluid/primitive/backend/CMakeLists.txt @@ -4,7 +4,7 @@ if(WITH_PYTHON OR NOT ON_INFER) cc_library( primitive_backend_eager_experimental SRCS ${eager_backend_files} - DEPS final_dygraph_function eager_utils phi) + DEPS final_dygraph_function eager_utils phi common) endif() set(static_backend_files ${CMAKE_CURRENT_SOURCE_DIR}/generated/generated_static_backend.cc diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index bb1a91a110793f..9b7323b76f2ae9 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -31,7 +31,7 @@ Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { if (need_cast) { x_tmp = cast(x, phi::DataType::FLOAT32); } - std::vector x_dim = phi::vectorize(x_tmp.dims()); + std::vector x_dim = common::vectorize(x_tmp.dims()); int64_t axis_size = axis.size(); int64_t x_dim_size = x_dim.size(); auto axis_ = std::vector(); @@ -54,7 +54,7 @@ Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { } auto sum_x = sum(x_tmp, IntArray(axis_), x_tmp.dtype(), keepdim); auto res = - sum_x / full(phi::vectorize(sum_x.dims()), value, sum_x.dtype()); + sum_x / full(common::vectorize(sum_x.dims()), value, sum_x.dtype()); if (need_cast) { return cast(res, org_dtype); } else { @@ -93,7 +93,7 @@ Tensor pow_decomp(const Tensor& x, const paddle::Scalar& y) { Tensor y_full; if (valid_type(y.dtype())) { - y_full = full(phi::vectorize(x_cast.dims()), y, x_cast.dtype()); + y_full = full(common::vectorize(x_cast.dims()), y, x_cast.dtype()); } else { PADDLE_THROW(phi::errors::InvalidArgument( "Unsupported data type: %s", phi::DataTypeToString(y.dtype()))); @@ -128,9 +128,9 @@ std::tuple batch_norm_decomp( x_cast = cast(x, phi::DataType::FLOAT32); } - std::vector x_dim = phi::vectorize(x_cast.dims()); + std::vector x_dim = common::vectorize(x_cast.dims()); int rank = x_dim.size(); - DataLayout data_layout_ = phi::StringToDataLayout(data_layout); + DataLayout data_layout_ = common::StringToDataLayout(data_layout); int feature_axis; if (data_layout_ == DataLayout::kNCHW) { feature_axis = 1; @@ -177,9 +177,10 @@ std::tuple batch_norm_decomp( run_mean_ = run_mean * momentum + batch_mean * (1. - momentum); run_var_ = run_var * momentum + batch_var * (1. 
- momentum); } else { - batch_mean = full(phi::vectorize(run_mean.dims()), 0, run_mean.dtype()); + batch_mean = + full(common::vectorize(run_mean.dims()), 0, run_mean.dtype()); auto batch_var = - full(phi::vectorize(run_var.dims()), 0, run_var.dtype()); + full(common::vectorize(run_var.dims()), 0, run_var.dtype()); inv_std = elementwise_pow((batch_var + epsilon), half); if (data_layout_ == DataLayout::kNHWC) { x_hat = @@ -195,10 +196,10 @@ std::tuple batch_norm_decomp( Tensor y; Tensor new_scale = scale ? scale.get() - : full(phi::vectorize(x_cast.dims()), 1, x_cast.dtype()); + : full(common::vectorize(x_cast.dims()), 1, x_cast.dtype()); Tensor new_bias = bias ? bias.get() - : full(phi::vectorize(x_cast.dims()), 0, x_cast.dtype()); + : full(common::vectorize(x_cast.dims()), 0, x_cast.dtype()); if (data_layout_ == DataLayout::kNHWC) { y = x_hat * new_scale + new_bias; } else { @@ -254,9 +255,9 @@ Tensor silu_decomp(const Tensor& x) { } // res = x / (1 + exp(-x)) - auto one = full(phi::vectorize(x.dims()), 1, x_tmp.dtype()); + auto one = full(common::vectorize(x.dims()), 1, x_tmp.dtype()); auto exp_temp = - exp(full(phi::vectorize(x.dims()), -1, x_tmp.dtype()) * x_tmp); + exp(full(common::vectorize(x.dims()), -1, x_tmp.dtype()) * x_tmp); auto res = x_tmp / (exp_temp + one); if (need_cast) { return cast(res, org_dtype); @@ -267,7 +268,7 @@ Tensor silu_decomp(const Tensor& x) { template Tensor relu_decomp(const Tensor& x) { - return maximum(x, full(phi::vectorize(x.dims()), 0.0, x.dtype())); + return maximum(x, full(common::vectorize(x.dims()), 0.0, x.dtype())); } template @@ -281,7 +282,7 @@ Tensor rsqrt_decomp(const Tensor& x) { } auto ans = elementwise_pow( - x_cast, full(phi::vectorize(x_cast.dims()), -0.5, x_cast.dtype())); + x_cast, full(common::vectorize(x_cast.dims()), -0.5, x_cast.dtype())); if (need_cast) { return cast(ans, org_dtype); } else { @@ -326,7 +327,7 @@ std::tuple layer_norm_decomp( x_cast = cast(x_cast, phi::DataType::FLOAT32); } - auto x_dim = phi::vectorize(x.dims()); + auto x_dim = common::vectorize(x.dims()); for (size_t i = begin_norm_axis; i < x_dim.size(); i++) { axis.push_back(static_cast(i)); } @@ -337,7 +338,7 @@ std::tuple layer_norm_decomp( auto var_tmp3 = variance + epsilon; auto rsqrt_var = elementwise_pow( var_tmp3, - full(phi::vectorize(var_tmp3.dims()), -0.5, var_tmp3.dtype())); + full(common::vectorize(var_tmp3.dims()), -0.5, var_tmp3.dtype())); auto out = difference * rsqrt_var; auto scale_ptr = scale.get_ptr(); @@ -462,7 +463,7 @@ Tensor sqrt_decomp(const Tensor& x) { } auto ans = elementwise_pow( - x_cast, full(phi::vectorize(x_cast.dims()), 0.5, x_cast.dtype())); + x_cast, full(common::vectorize(x_cast.dims()), 0.5, x_cast.dtype())); if (need_cast) { return cast(ans, org_dtype); } else { @@ -476,22 +477,24 @@ Tensor gelu_decomp(const Tensor& x, bool approximate) { const double PM_SQRT1_2 = 0.70710678118654752440; /* 1/sqrt(2) */ auto org_dtype = x.dtype(); - auto half = full(phi::vectorize(x.dims()), 0.5, org_dtype); - auto one = full(phi::vectorize(x.dims()), 1.0, org_dtype); + auto half = full(common::vectorize(x.dims()), 0.5, org_dtype); + auto one = full(common::vectorize(x.dims()), 1.0, org_dtype); if (approximate) { // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) - auto kAlpha = - full(phi::vectorize(x.dims()), PM_2_SQRTPI * PM_SQRT1_2, org_dtype); - auto GELU_CONSTANT = full(phi::vectorize(x.dims()), 0.044715, org_dtype); - auto x_pow3 = - elementwise_pow(x, full(phi::vectorize(x.dims()), 3, org_dtype)); + auto kAlpha 
= full( + common::vectorize(x.dims()), PM_2_SQRTPI * PM_SQRT1_2, org_dtype); + auto GELU_CONSTANT = + full(common::vectorize(x.dims()), 0.044715, org_dtype); + auto x_pow3 = elementwise_pow( + x, full(common::vectorize(x.dims()), 3, org_dtype)); auto tanh_out = tanh(kAlpha * (x + x_pow3 * GELU_CONSTANT)); auto res = x * half * (one + tanh_out); return res; } else { // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) - auto M_SQRT1_2T = full(phi::vectorize(x.dims()), PM_SQRT1_2, org_dtype); + auto M_SQRT1_2T = + full(common::vectorize(x.dims()), PM_SQRT1_2, org_dtype); auto erf_out = one + erf(x * M_SQRT1_2T); auto res = x * half * erf_out; diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h index 703e72f3bac0d5..16cef793c3f72b 100644 --- a/paddle/fluid/primitive/rule/vjp/details.h +++ b/paddle/fluid/primitive/rule/vjp/details.h @@ -47,8 +47,8 @@ void divide_grad(const Tensor& x, set_output(dy_res, dy); } else { auto dy_reduce_res = - sum(dy_res, phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + sum(dy_res, common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, dy); } } else { @@ -57,7 +57,7 @@ void divide_grad(const Tensor& x, } // indicate we will compute dy if (dx) { // dx = (1/y) * dout - auto one_tensor = full(phi::vectorize(y.dims()), 1.0, y.dtype()); + auto one_tensor = full(common::vectorize(y.dims()), 1.0, y.dtype()); auto dx_res = one_tensor / y * out_grad; if (y.dims() != x.dims()) { // Maybe need reduce here @@ -66,8 +66,8 @@ void divide_grad(const Tensor& x, set_output(dx_res, dx); } else { auto dx_reduce_res = - sum(dx_res, phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + sum(dx_res, common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, dx); } @@ -87,7 +87,7 @@ void sum_grad(const Tensor& x, if (!x_grad) { return; } - std::vector x_dim = phi::vectorize(x.dims()); + std::vector x_dim = common::vectorize(x.dims()); int64_t axis_size = axis.size(); int64_t x_dim_size = x_dim.size(); reduce_all = false; @@ -206,8 +206,8 @@ void reshape_grad(const Tensor& xshape, if (grad_x) { // xshape: [0] + x.shape auto xshape_dims = xshape.dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - auto grad_x_tmp = reshape(grad_out, phi::vectorize(x_dims)); + auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto grad_x_tmp = reshape(grad_out, common::vectorize(x_dims)); set_output(grad_x_tmp, grad_x); } } @@ -296,8 +296,8 @@ void add_grad(const Tensor& x, by_pass(out_grad, dy); } else { auto dy_reduce_res = - out_grad.sum(phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, dy); } @@ -313,8 +313,8 @@ void add_grad(const Tensor& x, by_pass(out_grad, dx); } else { auto dx_reduce_res = - out_grad.sum(phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, dx); } } else { @@ -339,8 +339,8 @@ void 
subtract_grad(const Tensor& x, by_pass(scale_out_grad, dy); } else { auto dy_reduce_res = - scale_out_grad.sum(phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + scale_out_grad.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, dy); } } else { @@ -355,8 +355,8 @@ void subtract_grad(const Tensor& x, by_pass(out_grad, dx); } else { auto dx_reduce_res = - out_grad.sum(phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + out_grad.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, dx); } } else { @@ -380,7 +380,7 @@ void multiply_grad(const Tensor& x, set_output(x_grad_unreduce, x_grad); } else { auto x_grad_reduced = x_grad_unreduce.sum( - phi::vectorize(axes), x_grad_unreduce.dtype(), false); + common::vectorize(axes), x_grad_unreduce.dtype(), false); if (x_grad_reduced.dims().size() != x.dims().size()) { x_grad_reduced = reshape(x_grad_reduced, x.shape()); } @@ -398,7 +398,7 @@ void multiply_grad(const Tensor& x, set_output(y_grad_unreduce, y_grad); } else { auto y_grad_reduced = y_grad_unreduce.sum( - phi::vectorize(axes), y_grad_unreduce.dtype(), false); + common::vectorize(axes), y_grad_unreduce.dtype(), false); if (y_grad_reduced.dims().size() != y.dims().size()) { y_grad_reduced = reshape(y_grad_reduced, y.shape()); } @@ -428,8 +428,8 @@ void elementwise_pow_grad(const Tensor& x, set_output(dy_res, dy); } else { auto dy_reduce_res = - dy_res.sum(phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, dy); } } else { @@ -448,8 +448,8 @@ void elementwise_pow_grad(const Tensor& x, set_output(dx_res, dx); } else { auto dx_reduce_res = - dx_res.sum(phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, dx); } @@ -508,7 +508,7 @@ void layer_norm_grad(const Tensor& x, auto tmp = (1.0 / (variance_ + epsilon)); // M,1 // auto sqrt_var_1 = sqrt(tmp); // M,1 auto sqrt_var_1 = elementwise_pow( - tmp, full(phi::vectorize(tmp.dims()), 0.5, tmp.dtype())); + tmp, full(common::vectorize(tmp.dims()), 0.5, tmp.dtype())); auto x_sub_mean_mul_sqrt_var_1 = x_sub_mean * sqrt_var_1; if (x_grad) { @@ -528,7 +528,7 @@ void layer_norm_grad(const Tensor& x, auto d_mean_d_std = (1.0 / shape_2) * (d_mean + d_std); auto x_grad_tmp = dx_end - d_mean_d_std; - x_grad_tmp = reshape(x_grad_tmp, phi::vectorize(x.dims())); + x_grad_tmp = reshape(x_grad_tmp, common::vectorize(x.dims())); if (x.dtype() == phi::DataType::FLOAT16 || x.dtype() == phi::DataType::BFLOAT16) { @@ -601,8 +601,9 @@ void dropout_grad(const Tensor& mask, template void erf_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { - auto m_2_sqrt_pi = full(phi::vectorize(x.dims()), M_2_SQRTPI, x.dtype()); - auto neg_one = full(phi::vectorize(x.dims()), -1.0, x.dtype()); + auto m_2_sqrt_pi = + full(common::vectorize(x.dims()), M_2_SQRTPI, x.dtype()); + auto neg_one = full(common::vectorize(x.dims()), -1.0, x.dtype()); auto 
neg_tmp = neg_one * x * x; auto mul_tmp = m_2_sqrt_pi * exp(neg_tmp); set_output(out_grad * mul_tmp, x_grad); @@ -615,13 +616,13 @@ void expand_grad(const Tensor& x, const IntArray& shape, Tensor* x_grad) { if (x_grad) { - auto out_dims = phi::make_ddim(shape.GetData()); + auto out_dims = common::make_ddim(shape.GetData()); if (out_dims != x.dims()) { auto axes = get_reduce_dims(x.dims(), out_dims); if (!axes.size()) { by_pass(out_grad, x_grad); } else { - auto reduced = out_grad.sum(phi::vectorize(axes), x.dtype(), false); + auto reduced = out_grad.sum(common::vectorize(axes), x.dtype(), false); if (reduced.dims().size() != x.dims().size()) { reduced = reshape(reduced, x.shape()); } @@ -732,8 +733,8 @@ void maximum_grad(const Tensor& x, set_output(dx_res, x_grad); } else { auto dx_reduce_res = - dx_res.sum(phi::vectorize(reduce_dim), x.dtype(), false); - auto dx_tmp = reshape(dx_reduce_res, phi::vectorize(x.dims())); + dx_res.sum(common::vectorize(reduce_dim), x.dtype(), false); + auto dx_tmp = reshape(dx_reduce_res, common::vectorize(x.dims())); set_output(dx_tmp, x_grad); } } else { @@ -751,8 +752,8 @@ void maximum_grad(const Tensor& x, set_output(dy_res, y_grad); } else { auto dy_reduce_res = - dy_res.sum(phi::vectorize(reduce_dim), y.dtype(), false); - auto dy_tmp = reshape(dy_reduce_res, phi::vectorize(y.dims())); + dy_res.sum(common::vectorize(reduce_dim), y.dtype(), false); + auto dy_tmp = reshape(dy_reduce_res, common::vectorize(y.dims())); set_output(dy_tmp, y_grad); } } else { @@ -765,10 +766,11 @@ template void relu_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { auto condition = greater_than( - out, full(phi::vectorize(out.dims()), 0.0, out.dtype())); - auto res = where(condition, - out_grad, - full(phi::vectorize(out.dims()), 0.0, out.dtype())); + out, full(common::vectorize(out.dims()), 0.0, out.dtype())); + auto res = + where(condition, + out_grad, + full(common::vectorize(out.dims()), 0.0, out.dtype())); set_output(res, x_grad); } } @@ -779,7 +781,7 @@ void gather_nd_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) { if (x_grad) { - auto zero_tensor = full(phi::vectorize(x.dims()), 0.0, x.dtype()); + auto zero_tensor = full(common::vectorize(x.dims()), 0.0, x.dtype()); auto x_grad_tmp = scatter_nd_add(zero_tensor, index, out_grad); set_output(x_grad_tmp, x_grad); } @@ -822,8 +824,8 @@ void max_grad(const Tensor& x, if (!x_grad) { return; } - auto zero_tensor = full(phi::vectorize(x.dims()), 0.0, x.dtype()); - std::vector x_dim = phi::vectorize(x.dims()); + auto zero_tensor = full(common::vectorize(x.dims()), 0.0, x.dtype()); + std::vector x_dim = common::vectorize(x.dims()); int64_t axis_size = axis.size(); int64_t x_dim_size = x_dim.size(); reduce_all = false; @@ -882,7 +884,7 @@ void slice_grad(const Tensor& input, if (decrease_size > 0) { if (decrease_size == static_cast(in_dims.size())) { // all dims decrease - out_dims = phi::make_ddim(std::vector(decrease_size, 1)); + out_dims = common::make_ddim(std::vector(decrease_size, 1)); } else { origin_out_shape.resize(out_dims.size() + decrease_size, -1); for (size_t i = 0; i < decrease_size; ++i) { @@ -896,7 +898,7 @@ void slice_grad(const Tensor& input, ++index; } } - out_dims = phi::make_ddim(origin_out_shape); + out_dims = common::make_ddim(origin_out_shape); } } @@ -937,13 +939,13 @@ void tile_grad(const Tensor& x, Tensor* x_grad) { if (x_grad) { auto repeat_times_data = repeat_times.GetData(); - auto out_grad_shape = phi::vectorize(out_grad.dims()); + auto out_grad_shape = 
common::vectorize(out_grad.dims()); auto result = out_grad; for (int i = 0; i < static_cast(repeat_times_data.size()); i++) { int size = out_grad_shape[i] / repeat_times_data[i]; std::vector sections(repeat_times_data[i], size); auto split_arr = split(result, IntArray(sections), i); - result = full(phi::vectorize(split_arr[0].dims()), 0.0, x.dtype()); + result = full(common::vectorize(split_arr[0].dims()), 0.0, x.dtype()); for (int j = 0; j < static_cast(split_arr.size()); j++) { result = split_arr[j] + result; } diff --git a/paddle/fluid/primitive/type/lazy_tensor.h b/paddle/fluid/primitive/type/lazy_tensor.h index df9f7f35ae8a85..792ccaa208fbad 100644 --- a/paddle/fluid/primitive/type/lazy_tensor.h +++ b/paddle/fluid/primitive/type/lazy_tensor.h @@ -13,9 +13,9 @@ // limitations under the License. #pragma once +#include "paddle/common/ddim.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/utils/utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/extended_tensor.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/pir/core/value.h" diff --git a/paddle/fluid/primitive/utils/CMakeLists.txt b/paddle/fluid/primitive/utils/CMakeLists.txt index babaa5cd7da7ff..6b3458d7844bca 100644 --- a/paddle/fluid/primitive/utils/CMakeLists.txt +++ b/paddle/fluid/primitive/utils/CMakeLists.txt @@ -2,9 +2,9 @@ if(WITH_PYTHON OR NOT ON_INFER) cc_library( primitive_eager_utils_experimental SRCS eager_utils.cc - DEPS phi common_infer_shape_functions) + DEPS phi common common_infer_shape_functions) endif() cc_library( primitive_static_utils_experimental SRCS static_utils.cc - DEPS phi common_infer_shape_functions op_dialect) + DEPS phi common common_infer_shape_functions op_dialect) diff --git a/paddle/fluid/primitive/utils/utils.h b/paddle/fluid/primitive/utils/utils.h index 4490cc683ab70a..e38398f4814859 100644 --- a/paddle/fluid/primitive/utils/utils.h +++ b/paddle/fluid/primitive/utils/utils.h @@ -15,10 +15,10 @@ #pragma once #include +#include "paddle/common/ddim.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace primitive { @@ -133,7 +133,7 @@ static phi::DDim get_reduce_dims_from_out(const phi::DDim& dout_dims, i)); } } - return phi::make_ddim(result); + return common::make_ddim(result); } static phi::DDim get_reduce_dims(const phi::DDim& x_dims, diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d3e0d0ec00343a..a864a70da2db57 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -19,6 +19,7 @@ set(PYBIND_DEPS layer tracer engine + common scope_pool analysis_predictor imperative_profiler @@ -43,7 +44,6 @@ set(PYBIND_DEPS program_translator pir_transforms pir - common new_profiler jit_layer jit_property @@ -64,7 +64,8 @@ if(WITH_PSCORE) endif() endif() if(WITH_RPC) - set(PYBIND_DEPS ${PYBIND_DEPS} paddle_rpc ${EXTERNAL_BRPC_DEPS} zlib phi) + set(PYBIND_DEPS ${PYBIND_DEPS} paddle_rpc ${EXTERNAL_BRPC_DEPS} zlib phi + common) endif() if(WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) @@ -361,10 +362,11 @@ if(WITH_PYTHON) list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/ir.dll) endif() - # add_custom_command( - # OUTPUT ${op_impl_path}/common.dll COMMAND ${CMAKE_COMMAND} -E copy - # ${COMMON_LIB} ${op_impl_path}) - # list(APPEND EAGER_OP_IMPL_DEPS 
${op_impl_path}/common.dll) + add_custom_command( + OUTPUT ${op_impl_path}/common.dll + COMMAND ${CMAKE_COMMAND} -E copy ${COMMON_LIB} ${op_impl_path} + DEPENDS common) + list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/common.dll) if(${CBLAS_PROVIDER} STREQUAL MKLML) add_custom_command( @@ -502,6 +504,7 @@ if(WITH_PYTHON) list(APPEND PYBIND_DEPS backward) list(APPEND PYBIND_DEPS grad_node_info) list(APPEND PYBIND_DEPS phi) + list(APPEND PYBIND_DEPS common) list(APPEND PYBIND_DEPS final_dygraph_function) list(APPEND PYBIND_DEPS final_dygraph_node) list(APPEND PYBIND_DEPS dygraph_function) diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc index d993da4c64fa67..4b73d24163d83d 100644 --- a/paddle/fluid/pybind/auto_parallel_py.cc +++ b/paddle/fluid/pybind/auto_parallel_py.cc @@ -795,7 +795,7 @@ static void parse_tensors(PyObject *obj, DistTensorSpec in = py::cast(PyList_GetItem(obj, i)); VLOG(6) << "Vector emplace_back DistTensorSpec: " << in.to_string(); ins.emplace_back(phi::distributed::DistMetaTensor( - phi::make_ddim(in.shape()), in.dist_attr())); + common::make_ddim(in.shape()), in.dist_attr())); } ctx->EmplaceBackInputs(ins); } @@ -807,7 +807,7 @@ static void parse_tensor(PyObject *obj, DistTensorSpec in = py::cast(obj); VLOG(6) << "DistTensorSpec: " << in.to_string(); ctx->EmplaceBackInput(phi::distributed::DistMetaTensor( - phi::make_ddim(in.shape()), in.dist_attr())); + common::make_ddim(in.shape()), in.dist_attr())); } // TODO(ljz) support other types diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index d60fe8799c10ff..894ede8db18d2b 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -83,7 +83,7 @@ void EmptyTensorInitializer(TensorObject* self, const std::vector& dims = {0}, framework::proto::VarType::Type var_type = paddle::framework::proto::VarType::LOD_TENSOR) { - auto ddims = phi::make_ddim(dims); + auto ddims = common::make_ddim(dims); self->tensor.set_name(name); auto autograd_meta = egr::EagerUtils::autograd_meta(&(self->tensor)); autograd_meta->SetPersistable(persistable); @@ -126,7 +126,7 @@ void EmptyStringTensorInitializer(TensorObject* self, const std::string& name, const paddle::platform::Place& place, const std::vector& dims = {}) { - auto ddims = phi::make_ddim(dims); + auto ddims = common::make_ddim(dims); self->tensor.set_name(name); // Note(zhoushunjie): Only support CPUPlace when create StringTensor auto actual_place = platform::CPUPlace(); @@ -135,7 +135,7 @@ void EmptyStringTensorInitializer(TensorObject* self, std::shared_ptr string_tensor = std::make_shared(&string_allocator, phi::StringTensorMeta{ddims}); - if (phi::product(ddims) > 0) { + if (common::product(ddims) > 0) { string_tensor->mutable_data(actual_place); } self->tensor.set_impl(string_tensor); @@ -153,7 +153,7 @@ void CreateDistTensorWithNumpyValue(TensorObject* self, paddle::framework::proto::VarType::FP32, const std::vector& dims = {0}) { #ifdef PADDLE_WITH_DISTRIBUTE - auto ddims = phi::make_ddim(dims); + auto ddims = common::make_ddim(dims); self->tensor.set_name(name); auto autograd_meta = egr::EagerUtils::autograd_meta(&(self->tensor)); autograd_meta->SetPersistable(persistable); @@ -229,7 +229,7 @@ void CreateDistTensorWithNumpyValue(TensorObject* self, paddle::framework::proto::VarType::FP32, const std::vector& dims = {0}) { #ifdef PADDLE_WITH_DISTRIBUTE - auto ddims = phi::make_ddim(dims); + auto ddims = common::make_ddim(dims); self->tensor.set_name(name); auto autograd_meta = 
egr::EagerUtils::autograd_meta(&(self->tensor)); autograd_meta->SetPersistable(persistable); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 356b447988db07..df84ca68b9182b 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -512,7 +512,7 @@ static PyObject* eager_api__get_custom_operator_inplace_reverse_idx( // This function copies from function `EmptyTensorInitializer` with default // parameters static Tensor InitializedEmptyTensor() { - auto ddims = phi::make_ddim({0}); + auto ddims = common::make_ddim({0}); auto tensor = paddle::Tensor(); tensor.set_name( egr::Controller::Instance().GenerateUniqueName("generated_tensor")); @@ -882,7 +882,7 @@ static PyObject* eager_api_sparse_coo_tensor(PyObject* self, // sort and merge duplicate indices std::shared_ptr coo_tensor = std::make_shared( - *dense_indices, *dense_elements, phi::make_ddim(dense_shape)); + *dense_indices, *dense_elements, common::make_ddim(dense_shape)); tensor.set_impl(coo_tensor); auto name = egr::Controller::Instance().GenerateUniqueName("generated_tensor"); @@ -932,7 +932,7 @@ static PyObject* eager_api_sparse_csr_tensor(PyObject* self, std::make_shared(*dense_crows, *dense_cols, *dense_elements, - phi::make_ddim(dense_shape)); + common::make_ddim(dense_shape)); tensor.set_impl(csr_tensor); auto name = egr::Controller::Instance().GenerateUniqueName("generated_tensor"); diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index aa7a27db207364..bfe10107b24d87 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -44,13 +44,13 @@ typedef SSIZE_T ssize_t; #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include "paddle/common/ddim.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/pybind/tensor_py.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 6323c895d3896e..5effab997848d9 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -52,6 +52,7 @@ typedef SSIZE_T ssize_t; #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include "paddle/common/ddim.h" #include "paddle/fluid/eager/amp_utils.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/eager_amp_auto_cast.h" @@ -59,7 +60,6 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/lib/data_transform.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.h" #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_function_registry.h" @@ -1461,7 +1461,7 @@ static PyObject* tensor__getitem_from_offset(TensorObject* self, const auto& tensor_dims = tensor.dims(); std::vector 
dims(tensor_dims.size()); - std::vector stride = phi::vectorize(tensor.strides()); + std::vector stride = common::vectorize(tensor.strides()); size_t numel = 1; for (int i = tensor_dims.size() - 1; i >= 0; --i) { diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 582d15909e9411..985086d05b5f1e 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -583,7 +583,7 @@ PyObject* tensor_properties_get_local_shape(TensorObject* self, void* closure) { #ifdef PADDLE_WITH_DISTRIBUTE phi::distributed::DistTensor* dist_tensor = static_cast(self->tensor.impl().get()); - return ToPyObject(phi::vectorize(dist_tensor->local_dims())); + return ToPyObject(common::vectorize(dist_tensor->local_dims())); #else PADDLE_THROW(platform::errors::Unavailable( "The `_local_shape` property of (Dist)Tensor is not supported " @@ -657,7 +657,7 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { << " tensor layout: " << self->tensor.layout() << " tensor's shape size is : " << value.size(); std::vector dims = value; - if (change_dim && phi::DataLayoutToString(desired_layout) == "NCHW") { + if (change_dim && common::DataLayoutToString(desired_layout) == "NCHW") { // NCHW -> NHWC VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0] << " " << value[1] << " " << value[2] << " " << value[3] << " to " @@ -667,7 +667,7 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { value[2] = dims[3]; value[3] = dims[1]; } else if (change_dim && - phi::DataLayoutToString(desired_layout) == "NHWC") { + common::DataLayoutToString(desired_layout) == "NHWC") { // NHWC -> NCHW VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0] << " " << value[1] << " " << value[2] << " " << value[3] << " to " @@ -798,7 +798,7 @@ PyObject* tensor_properties_get_layout(TensorObject* self, void* closure) { VLOG(3) << "VariableCompatTensor does not support `layout` method."; return ToPyObject(layout); } else { - return ToPyObject(phi::DataLayoutToString(self->tensor.layout())); + return ToPyObject(common::DataLayoutToString(self->tensor.layout())); } return ToPyObject(layout); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index a465dde78f2637..2b8f36f8988cfa 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/eager_utils.h" #include +#include "paddle/common/exception.h" #include "paddle/pir/core/value.h" // Avoid a problem with copysign defined in pyconfig.h on Windows. #ifdef copysign @@ -136,7 +137,7 @@ void ConvertToDistTensor(Tensor* x, const phi::distributed::ProcessMesh* mesh) { "as it's not phi::DenseTensor.", x->name())); phi::distributed::TensorDistAttr dist_attr( - phi::vectorize(x->impl()->dims())); + common::vectorize(x->impl()->dims())); dist_attr.set_process_mesh(*mesh); auto dense_t = std::static_pointer_cast(x->impl()); // auto parallel in dygraph doesn't support strided kernel. 
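Illustrative sketch (not part of the applied diff): the hunks in this file, like the pybind changes above, apply one mechanical rewrite — DDim helpers formerly reached through phi:: are now reached through common::, with the declarations coming from paddle/common/ddim.h. A minimal, self-contained example of the pattern, assuming only that `t` is a phi::DenseTensor (the function and variable names are illustrative):

    #include <cstdint>
    #include <vector>

    #include "paddle/common/ddim.h"
    #include "paddle/phi/core/dense_tensor.h"

    void MigrationSketch(phi::DenseTensor& t) {
      // Before: auto shape = phi::vectorize(t.dims());
      std::vector<int64_t> shape = common::vectorize(t.dims());
      // Before: t.Resize(phi::make_ddim(shape));
      t.Resize(common::make_ddim(shape));
    }

The same substitution covers slice_ddim, product, stride_numel, DataLayoutToString and the other helpers seen throughout this patch.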
@@ -1878,7 +1879,7 @@ paddle::Tensor CreateTensorFromVarDesc( auto var_type = var_desc.GetType(); - auto ddims = phi::make_ddim(dims); + auto ddims = common::make_ddim(dims); tensor.set_name(var_desc.Name()); auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor); autograd_meta->SetPersistable(false); diff --git a/paddle/fluid/pybind/eval_frame_tools.cc b/paddle/fluid/pybind/eval_frame_tools.cc index 3b8df99eb2a3f3..da09c2478c02cd 100644 --- a/paddle/fluid/pybind/eval_frame_tools.cc +++ b/paddle/fluid/pybind/eval_frame_tools.cc @@ -18,9 +18,9 @@ #include +#include "paddle/common/errors.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" /*============================ Dict Tree ================================*/ diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 7c166021f7b004..bede935a14dc3b 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/pybind/exception.h" +#include "paddle/common/exception.h" #include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/phi/api/ext/exception.h" namespace paddle { namespace pybind { @@ -128,7 +128,7 @@ void ThrowExceptionToPython(std::exception_ptr p) { PyErr_SetString(EnforceNotMetException, e.what()); break; } - } catch (const paddle::PD_Exception& e) { + } catch (const common::PD_Exception& e) { PyErr_SetString(PyExc_OSError, e.what()); } } diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 7f8cac9ee1edf9..57ae90688e7689 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -22,11 +22,11 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/flags.h" #include "pybind11/stl.h" diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 7d5aa7863f0e7d..e44ccfe88915f7 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -1156,7 +1156,7 @@ SplitedResult SplitForwardBackward( } auto value_type = v.type().dyn_cast(); auto dtype = paddle::dialect::TransToPhiDataType(value_type.dtype()); - auto shape = phi::vectorize(value_type.dims()); + auto shape = common::vectorize(value_type.dims()); auto place = phi::Place(); paddle::dialect::DataOp op = diff --git a/paddle/fluid/pybind/pir.h b/paddle/fluid/pybind/pir.h index 81ae155bbd28ef..30c3e83af4a9e8 100644 --- a/paddle/fluid/pybind/pir.h +++ b/paddle/fluid/pybind/pir.h @@ -15,8 +15,8 @@ #pragma once #include +#include "paddle/common/ddim.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/ddim.h" #include "paddle/pir/core/op_result.h" namespace paddle { diff --git a/paddle/fluid/pybind/process_group_utils.h b/paddle/fluid/pybind/process_group_utils.h index 5795e1aa8a8ce7..3ba9ec3239c371 100644 --- a/paddle/fluid/pybind/process_group_utils.h +++ b/paddle/fluid/pybind/process_group_utils.h @@ -97,21 +97,21 @@ struct SplitDenseTensor { std::vector); auto *kernel_fn = kernel.GetVariadicKernelFn(); - auto in_dims = phi::vectorize(in.dims()); - auto origin_out_dims = phi::vectorize(out->at(0)->dims()); + auto in_dims = common::vectorize(in.dims()); + 
auto origin_out_dims = common::vectorize(out->at(0)->dims()); for (auto *tensor : *out) { if (origin_out_dims.size() != in_dims.size()) { std::vector new_dims({1}); new_dims.insert( new_dims.end(), origin_out_dims.begin(), origin_out_dims.end()); - tensor->Resize(phi::make_ddim(new_dims)); + tensor->Resize(common::make_ddim(new_dims)); } } (*kernel_fn)(context, in, out->size(), phi::Scalar(0), *out); for (auto *tensor : *out) { - auto tensor_dims = phi::vectorize(tensor->dims()); + auto tensor_dims = common::vectorize(tensor->dims()); if (tensor_dims.size() != origin_out_dims.size()) { - tensor->Resize(phi::make_ddim(origin_out_dims)); + tensor->Resize(common::make_ddim(origin_out_dims)); } } } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 41e897bb8a2431..cfd8fb5f6e36f4 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -635,7 +635,7 @@ static void inline CreateVariableIfNotExist( Py_DECREF(py_var_desc); var = const_cast(&scope)->Var(para_name); auto *tensor_temp = var->GetMutable(); - tensor_temp->Resize(phi::make_ddim(var_desc.GetShape())); + tensor_temp->Resize(common::make_ddim(var_desc.GetShape())); tensor_temp->mutable_data( exe->GetPlace(), framework::TransToPhiDataType(var_desc.GetDataType())); @@ -1016,8 +1016,8 @@ PYBIND11_MODULE(libpaddle, m) { m.def( "broadcast_shape", [](const std::vector &x_dim, const std::vector &y_dim) { - return phi::vectorize(operators::details::BroadcastTwoDims( - phi::make_ddim(x_dim), phi::make_ddim(y_dim), -1)); + return common::vectorize(operators::details::BroadcastTwoDims( + common::make_ddim(x_dim), common::make_ddim(y_dim), -1)); }); m.def( diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index f4b4ff4cf42ca6..a136afe4cca383 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -23,6 +23,7 @@ #include "Python.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/tracer.h" @@ -30,7 +31,6 @@ #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/operators/reader/py_reader.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/flags.h" #include "paddle/utils/flags.h" #include "pybind11/stl.h" @@ -56,7 +56,7 @@ static paddle::optional> DiffTensorShape( if (UNLIKELY(rank == 0)) { if (!target_shape.empty()) { // Tensor rank = 0 but desc does not match - return phi::vectorize(tensor_shape); + return common::vectorize(tensor_shape); } else { return paddle::none; } @@ -77,12 +77,12 @@ static paddle::optional> DiffTensorShape( tensor_shape[0] = split_size; if (target_shape[0] >= 0) { // need check dim 0 if (tensor_shape[0] != target_shape[0]) { - return phi::vectorize(tensor_shape); + return common::vectorize(tensor_shape); } if (remainder > 0) { tensor_shape[0] = remainder; - return phi::vectorize(tensor_shape); + return common::vectorize(tensor_shape); } } } @@ -95,7 +95,7 @@ static paddle::optional> DiffTensorShape( "Tensor shape at dim %d must not be less than 0", idx)); if (target_shape[idx] >= 0 && tensor_shape[static_cast(idx)] != target_shape[idx]) { - return phi::vectorize(tensor_shape); + return common::vectorize(tensor_shape); } } @@ -152,7 +152,7 @@ class MultiDeviceFeedReader { pin_memory_(pin_memory) { std::vector dims; for (auto &shape : shapes) { - dims.push_back(phi::make_ddim(shape)); + 
dims.push_back(common::make_ddim(shape)); } auto first_reader = std::make_shared( diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index de0f3da2deb14e..aa4e92c6e8af67 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -221,14 +221,16 @@ void BindTensor(pybind11::module &m) { // NOLINT .def("_is_initialized", [](const phi::DenseTensor &self) { return self.IsInitialized(); }) .def("_get_dims", - [](const phi::DenseTensor &self) { return vectorize(self.dims()); }) + [](const phi::DenseTensor &self) { + return common::vectorize(self.dims()); + }) .def("_set_dims", [](phi::DenseTensor &self, const std::vector &dim) { - self.Resize(phi::make_ddim(dim)); + self.Resize(common::make_ddim(dim)); }) .def("_set_layout", [](phi::DenseTensor &self, const std::string &layout) { - self.set_layout(phi::StringToDataLayout(layout)); + self.set_layout(common::StringToDataLayout(layout)); }) .def("_alloc_float", [](phi::DenseTensor &self, paddle::platform::CustomPlace &place) { @@ -402,7 +404,7 @@ void BindTensor(pybind11::module &m) { // NOLINT .def( "shape", - [](phi::DenseTensor &self) { return vectorize(self.dims()); }, + [](phi::DenseTensor &self) { return common::vectorize(self.dims()); }, R"DOC( Return the shape of Tensor. @@ -468,7 +470,7 @@ void BindTensor(pybind11::module &m) { // NOLINT }) .def("_layout", [](phi::DenseTensor &self) { - return phi::DataLayoutToString(self.layout()); + return common::DataLayoutToString(self.layout()); }) .def("_share_data_with", &phi::DenseTensor::ShareDataWith) .def("__getitem__", PySliceTensor, py::return_value_policy::reference) @@ -517,7 +519,7 @@ void BindTensor(pybind11::module &m) { // NOLINT new_lod.reserve(lod.size()); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); PADDLE_ENFORCE_EQ( - CheckLoD(new_lod, vectorize(self.dims()).front()), + CheckLoD(new_lod, common::vectorize(self.dims()).front()), true, platform::errors::InvalidArgument( "The provided LoD is invalid, the LoD is %s", new_lod)); @@ -559,7 +561,8 @@ void BindTensor(pybind11::module &m) { // NOLINT std::back_inserter(new_lod)); LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, vectorize(self.dims()).front()), + CheckLoD(new_offset_lod, + common::vectorize(self.dims()).front()), true, platform::errors::InvalidArgument( "The provided recursive_sequence_lengths info is " @@ -660,8 +663,9 @@ void BindTensor(pybind11::module &m) { // NOLINT [](phi::DenseTensor &self) -> bool { // Check that the lod info is valid and match the outermost // dimension of the Tensor data - return CheckLoD(self.lod(), - static_cast(vectorize(self.dims()).front())); + return CheckLoD( + self.lod(), + static_cast(common::vectorize(self.dims()).front())); }, R"DOC( Check whether the LoD of the Tensor is valid. 
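A short note on the CheckLoD calls just above: vectorize() flattens the DDim into a std::vector<int64_t>, so .front() is the outermost dimension — the only value the LoD validity check needs. A hedged sketch of the pattern, using the same two-argument CheckLoD form as the bindings (`t` is illustrative):

    // `t` is a phi::DenseTensor with, say, dims {6, 3}.
    // common::vectorize(t.dims()) == {6, 3}, so front() == 6: the
    // outermost dimension that the LoD offsets must cover.
    const auto dims_vec = common::vectorize(t.dims());
    const bool lod_ok =
        CheckLoD(t.lod(), static_cast<size_t>(dims_vec.front()));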
@@ -725,7 +729,7 @@ void BindTensor(pybind11::module &m) { // NOLINT size_t size = t[0].cast(); auto dtype = static_cast(t[1].cast()); - auto dims = phi::make_ddim(t[2].cast>()); + auto dims = common::make_ddim(t[2].cast>()); auto lod_info = t[3].cast(); auto device_id = t[4].cast(); @@ -787,8 +791,12 @@ void BindTensor(pybind11::module &m) { // NOLINT framework::SizeOfType( framework::TransToProtoVarType(self.type())); - return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size, - type_idx, vectorize(self.dims()), self.lod(), + return py::make_tuple(_handle, + (py::size_t)offset_bytes, + data_size, + type_idx, + common::vectorize(self.dims()), + self.lod(), device_id); }, R"DOC( @@ -832,7 +840,7 @@ void BindTensor(pybind11::module &m) { // NOLINT tensor.ResetHolderWithType( shared_reader_holder, static_cast(t[3].cast())); - tensor.Resize(phi::make_ddim(t[4].cast>())); + tensor.Resize(common::make_ddim(t[4].cast>())); tensor.set_lod(t[5].cast()); return tensor; @@ -911,7 +919,7 @@ void BindTensor(pybind11::module &m) { // NOLINT return py::make_tuple(mmap_allocation->ipc_name(), mmap_allocation->size(), type_idx, - vectorize(self.dims()), self.lod()); + common::vectorize(self.dims()), self.lod()); }, R"DOC( Serialize CPU lod tensor in shared memory to tuple. @@ -953,7 +961,7 @@ void BindTensor(pybind11::module &m) { // NOLINT tensor.ResetHolderWithType( shared_holder, static_cast(t[2].cast())); - tensor.Resize(phi::make_ddim(t[3].cast>())); + tensor.Resize(common::make_ddim(t[3].cast>())); tensor.set_lod(t[4].cast()); return tensor; @@ -1017,7 +1025,7 @@ void BindTensor(pybind11::module &m) { // NOLINT return py::make_tuple(mmap_writer_allocation->ipc_name(), mmap_writer_allocation->size(), type_idx, - vectorize(t.dims()), t.lod()); + common::vectorize(t.dims()), t.lod()); }, [](py::tuple t) { // __setstate__ if (t.size() != 5) @@ -1041,7 +1049,7 @@ void BindTensor(pybind11::module &m) { // NOLINT tensor.ResetHolderWithType( shared_reader_holder, static_cast(t[2].cast())); - tensor.Resize(phi::make_ddim(t[3].cast>())); + tensor.Resize(common::make_ddim(t[3].cast>())); tensor.set_lod(t[4].cast()); return tensor; diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 4b50fd5084ed69..dd5bd7f1d91c4d 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -389,7 +389,7 @@ void SetTensorFromPyArrayT( for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } - self->Resize(phi::make_ddim(dims)); + self->Resize(common::make_ddim(dims)); if (paddle::platform::is_cpu_place(place)) { if (zero_copy) { @@ -556,7 +556,7 @@ void SetStringTensorFromPyArray(phi::StringTensor *self, for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } - self->Resize(phi::make_ddim(dims)); + self->Resize(common::make_ddim(dims)); auto itemsize = array.itemsize(); if (paddle::platform::is_cpu_place(place)) { auto dst = self->mutable_data(place); @@ -609,7 +609,7 @@ void SetUVATensorFromPyArrayImpl( dims.emplace_back(static_cast(array.shape()[i])); numel *= static_cast(array.shape()[i]); } - self_tensor->Resize(phi::make_ddim(dims)); + self_tensor->Resize(common::make_ddim(dims)); auto data_type = framework::ToDataType(std::type_index(typeid(T))); const auto &need_allocate_size = numel * framework::SizeOfType(data_type); @@ -652,7 +652,7 @@ void SetUVATensorFromPyArray(const std::shared_ptr &self, #if defined(PADDLE_WITH_CUDA) VLOG(4) << "Running in 
SetUVATensorFromPyArray for Phi::Tensor."; phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 1})); std::shared_ptr tmp_t = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) @@ -672,7 +672,7 @@ void _sliceCompute(const phi::DenseTensor *in, const std::vector &axes, const std::vector &starts) { auto &eigen_place = *ctx.eigen_device(); - auto out_dims = phi::vectorize(out->dims()); + auto out_dims = common::vectorize(out->dims()); auto in_dims = in->dims(); auto offsets = Eigen::DSizes(); @@ -708,8 +708,8 @@ void _concatCompute(const std::vector &ins, if (axis == 0 && ins.size() < 10) { size_t output_offset = 0; for (auto &in : ins) { - auto in_stride = phi::stride_numel(in.dims()); - auto out_stride = phi::stride_numel(out->dims()); + auto in_stride = common::stride_numel(in.dims()); + auto out_stride = common::stride_numel(out->dims()); phi::funcs::StridedNumelCopyWithAxis( ctx, axis, diff --git a/paddle/fluid/sub_graph/sub_graph_checker.cc b/paddle/fluid/sub_graph/sub_graph_checker.cc index 89a7a00d58d553..b13f25fcb7ff20 100644 --- a/paddle/fluid/sub_graph/sub_graph_checker.cc +++ b/paddle/fluid/sub_graph/sub_graph_checker.cc @@ -336,7 +336,7 @@ void SubGraphChecker::InitInputs(const std::vector& input_values, for (size_t i = 0; i < input_values.size(); ++i) { auto tensor_type = input_values[i].type().dyn_cast(); - auto shape = phi::vectorize(tensor_type.dims()); + auto shape = common::vectorize(tensor_type.dims()); auto random = builder .Build( diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 45e4b8bd085d5e..64c18b2b60ff0a 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -42,7 +42,8 @@ set(PHI_DEPS eigen3 xxhash cblas - utf8proc) + utf8proc + common) set(INFERENCE_DEPS phi_profiler_proto auto_parallel_proto) diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 74a016a183b2b4..ec521021859706 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -29,14 +29,14 @@ limitations under the License. */ #include "paddle/phi/api/include/tensor_utils.h" // phi common headers +#include "paddle/common/layout.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" // original custom op headers +#include "paddle/common/exception.h" #include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/api/ext/exception.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/ext/tensor_compat.h" diff --git a/paddle/phi/api/ext/exception.h b/paddle/phi/api/ext/exception.h deleted file mode 100644 index 92b17b4898d3f7..00000000000000 --- a/paddle/phi/api/ext/exception.h +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -namespace paddle { - -//////////////// Exception handling and Error Message ///////////////// -#if !defined(_WIN32) -#define PD_UNLIKELY(expr) (__builtin_expect(static_cast(expr), 0)) -#define PD_LIKELY(expr) (__builtin_expect(static_cast(expr), 1)) -#else -#define PD_UNLIKELY(expr) (expr) -#define PD_LIKELY(expr) (expr) -#endif - -struct PD_Exception : public std::exception { - public: - template - explicit PD_Exception(const std::string& msg, - const char* file, - int line, - const char* default_msg) { - std::ostringstream sout; - if (msg.empty()) { - sout << default_msg << "\n [" << file << ":" << line << "]"; - } else { - sout << msg << "\n [" << file << ":" << line << "]"; - } - err_msg_ = sout.str(); - } - - const char* what() const noexcept override { return err_msg_.c_str(); } - - private: - std::string err_msg_; -}; - -class ErrorMessage { - public: - template - explicit ErrorMessage(const Args&... args) { - build_string(args...); - } - - void build_string() { oss << ""; } - - template - void build_string(const T& t) { - oss << t; - } - - template - void build_string(const T& t, const Args&... args) { - build_string(t); - build_string(args...); - } - - std::string to_string() { return oss.str(); } - - private: - std::ostringstream oss; -}; - -#define PD_CHECK(COND, ...) \ - do { \ - if (PD_UNLIKELY(!(COND))) { \ - auto __message__ = ::paddle::ErrorMessage(__VA_ARGS__).to_string(); \ - throw ::paddle::PD_Exception(__message__, \ - __FILE__, \ - __LINE__, \ - "Expected " #COND \ - ", but it's not satisfied."); \ - } \ - } while (0) - -#define PD_THROW(...) \ - do { \ - auto __message__ = ::paddle::ErrorMessage(__VA_ARGS__).to_string(); \ - throw ::paddle::PD_Exception( \ - __message__, __FILE__, __LINE__, "An error occurred."); \ - } while (0) - -} // namespace paddle diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index 6ece2298980c4b..c324a783cb67d0 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/utils/any.h" diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h index 6b6fe290d6d288..86ba7b9cf75764 100644 --- a/paddle/phi/api/include/context_pool.h +++ b/paddle/phi/api/include/context_pool.h @@ -16,9 +16,9 @@ limitations under the License. 
*/ #include +#include "paddle/common/macros.h" #include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/flat_hash_map.h" namespace phi { diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index b8d66f6c228c72..a6e78686e1e4ce 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -29,21 +29,21 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#include "paddle/common/layout.h" #include "paddle/phi/api/include/dll_decl.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/common/scalar.h" namespace phi { class DenseTensor; +class TensorBase; } // namespace phi -namespace phi { -class TensorBase; +namespace common { class DDim; -} // namespace phi +} // namespace common namespace paddle { // TODO(chenweihang): Remove the experimental namespace for Scalar and IntArray @@ -173,9 +173,9 @@ class PADDLE_API Tensor final { /** * @brief Return the dimensions of Tensor. * - * @return phi::DDim + * @return common::DDim */ - const phi::DDim& dims() const; + const common::DDim& dims() const; /** * @brief Return the shape (dimensions) of Tensor. @@ -190,9 +190,9 @@ class PADDLE_API Tensor final { /** * @brief Return the strides (dimensions) of Tensor. * - * @return phi::DDim + * @return common::DDim */ - const phi::DDim& strides() const; + const common::DDim& strides() const; /** * @brief Reset the shape of the tensor. diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index 0dea1ecb3db8e1..5a0316eddade33 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -338,8 +338,8 @@ void TransStride(const Context& dev_ctx, phi::StridedCopyKernel( dev_ctx, *from, - phi::vectorize(to->dims()), - phi::vectorize(to->strides()), + common::vectorize(to->dims()), + common::vectorize(to->strides()), to->offset(), to); })); @@ -357,8 +357,8 @@ void TransStride(const Context& dev_ctx, phi::StridedCopyKernel( dev_ctx, *from[i], - phi::vectorize(to[i]->dims()), - phi::vectorize(to[i]->strides()), + common::vectorize(to[i]->dims()), + common::vectorize(to[i]->strides()), to[i]->offset(), to[i]); })); @@ -377,8 +377,8 @@ void TransStride(phi::DeviceContext* dev_ctx, phi::StridedCopyKernel( *cpu_ctx, *from, - phi::vectorize(to->dims()), - phi::vectorize(to->strides()), + common::vectorize(to->dims()), + common::vectorize(to->strides()), to->offset(), to); })); @@ -392,8 +392,8 @@ void TransStride(phi::DeviceContext* dev_ctx, phi::StridedCopyKernel( *gpu_ctx, *from, - phi::vectorize(to->dims()), - phi::vectorize(to->strides()), + common::vectorize(to->dims()), + common::vectorize(to->strides()), to->offset(), to); })); @@ -408,8 +408,8 @@ void TransStride(phi::DeviceContext* dev_ctx, phi::StridedCopyKernel( *xpu_ctx, *from, - phi::vectorize(to->dims()), - phi::vectorize(to->strides()), + common::vectorize(to->dims()), + common::vectorize(to->strides()), to->offset(), to); })); @@ -430,8 +430,8 @@ void TransStrideLegacy(phi::DeviceContext* dev_ctx, phi::StridedCopyKernel( *cpu_ctx, *from, - phi::vectorize(to->dims()), - phi::vectorize(to->strides()), + common::vectorize(to->dims()), + common::vectorize(to->strides()), to->offset(), to); })); @@ -444,8 +444,8 @@ void TransStrideLegacy(phi::DeviceContext* dev_ctx, phi::StridedCopyKernel( *gpu_ctx, *from, - 
phi::vectorize(to->dims()), - phi::vectorize(to->strides()), + common::vectorize(to->dims()), + common::vectorize(to->strides()), to->offset(), to); })); @@ -459,8 +459,8 @@ void TransStrideLegacy(phi::DeviceContext* dev_ctx, phi::StridedCopyKernel( *xpu_ctx, *from, - phi::vectorize(to->dims()), - phi::vectorize(to->strides()), + common::vectorize(to->dims()), + common::vectorize(to->strides()), to->offset(), to); })); @@ -481,8 +481,8 @@ void TransStride(phi::DeviceContext* dev_ctx, phi::StridedCopyKernel( *cpu_ctx, *from[i], - phi::vectorize(to[i]->dims()), - phi::vectorize(to[i]->strides()), + common::vectorize(to[i]->dims()), + common::vectorize(to[i]->strides()), to[i]->offset(), to[i]); })); @@ -496,8 +496,8 @@ void TransStride(phi::DeviceContext* dev_ctx, phi::StridedCopyKernel( *gpu_ctx, *from[i], - phi::vectorize(to[i]->dims()), - phi::vectorize(to[i]->strides()), + common::vectorize(to[i]->dims()), + common::vectorize(to[i]->strides()), to[i]->offset(), to[i]); })); @@ -512,8 +512,8 @@ void TransStride(phi::DeviceContext* dev_ctx, phi::StridedCopyKernel( *xpu_ctx, *from[i], - phi::vectorize(to[i]->dims()), - phi::vectorize(to[i]->strides()), + common::vectorize(to[i]->dims()), + common::vectorize(to[i]->strides()), to[i]->offset(), to[i]); })); @@ -721,7 +721,7 @@ void SetReplicatedDistAttrForOutput( if (out) { // For inplace output, we also need to set replicated dist attr auto dist_attr = - phi::distributed::TensorDistAttr(phi::vectorize(out->dims())); + phi::distributed::TensorDistAttr(common::vectorize(out->dims())); dist_attr.set_process_mesh(process_mesh); out->unsafe_set_dist_attr(dist_attr); } diff --git a/paddle/phi/api/lib/backend_set.h b/paddle/phi/api/lib/backend_set.h index 51e08374bc9d65..af4de2580f5784 100644 --- a/paddle/phi/api/lib/backend_set.h +++ b/paddle/phi/api/lib/backend_set.h @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/common/backend.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/data_type_set.h b/paddle/phi/api/lib/data_type_set.h index 04580e37f8fb52..7f9a1c44de3d3c 100644 --- a/paddle/phi/api/lib/data_type_set.h +++ b/paddle/phi/api/lib/data_type_set.h @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/common/data_type.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 2cece35ba1582e..62d05e18c4a614 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -167,11 +167,12 @@ Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) { : ParseBackend(tensor); } -DataLayout ParseLayout(DataLayout layout) { return layout; } -DataLayout ParseLayout(const Tensor& tensor) { return tensor.layout(); } +phi::DataLayout ParseLayout(phi::DataLayout layout) { return layout; } +phi::DataLayout ParseLayout(const Tensor& tensor) { return tensor.layout(); } -DataLayout ParseLayoutWithInputOrder(DataLayout layout, const Tensor& tensor) { - return layout != DataLayout::UNDEFINED ? layout : ParseLayout(tensor); +phi::DataLayout ParseLayoutWithInputOrder(phi::DataLayout layout, + const Tensor& tensor) { + return layout != phi::DataLayout::UNDEFINED ? 
layout : ParseLayout(tensor); } } // namespace experimental diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index 7bd3524ed795c3..9d7de268309e33 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -18,12 +18,12 @@ limitations under the License. */ #include #include #include +#include "paddle/common/layout.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/backend_set.h" #include "paddle/phi/api/lib/data_type_set.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/sparse_coo_tensor.h" @@ -52,7 +52,7 @@ enum class KernelType { // TODO(chenweihang): support DataLayout and DataType selected struct KernelKeySet { BackendSet backend_set{Backend::UNDEFINED}; - DataLayout layout{DataLayout::UNDEFINED}; + phi::DataLayout layout{phi::DataLayout::UNDEFINED}; DataType dtype{DataType::UNDEFINED}; // TODO(chenweihang): iterate all kernelkey for kernel selection @@ -239,9 +239,10 @@ Backend ParseBackend(T t, Args... args) { } Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor); -DataLayout ParseLayout(DataLayout layout); -DataLayout ParseLayout(const Tensor& tensor); -DataLayout ParseLayoutWithInputOrder(DataLayout layout, const Tensor& tensor); +phi::DataLayout ParseLayout(phi::DataLayout layout); +phi::DataLayout ParseLayout(const Tensor& tensor); +phi::DataLayout ParseLayoutWithInputOrder(phi::DataLayout layout, + const Tensor& tensor); template bool AllInputsAreDistTensor(const Args&... args) { diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 206d5082e62dd1..49c47cbcce363c 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -20,11 +20,11 @@ limitations under the License. 
*/ #include "glog/logging.h" +#include "paddle/common/ddim.h" #include "paddle/phi/api/include/context_pool.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" #include "paddle/phi/core/enforce.h" @@ -72,8 +72,9 @@ Tensor::Tensor(const Place &place) { DefaultAllocator alloc(place); impl_ = std::make_shared( &alloc, - phi::DenseTensorMeta( - phi::DataType::FLOAT32, phi::make_ddim({}), phi::DataLayout::NCHW)); + phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim({}), + phi::DataLayout::NCHW)); } Tensor::Tensor(const Place &place, const std::vector &shape) { @@ -89,7 +90,7 @@ Tensor::Tensor(const Place &place, const std::vector &shape) { impl_ = std::make_shared( &alloc, phi::DenseTensorMeta(phi::DataType::FLOAT32, - phi::make_ddim({shape}), + common::make_ddim({shape}), phi::DataLayout::NCHW)); } @@ -107,7 +108,7 @@ const phi::DDim &Tensor::dims() const { return impl_->dims(); } std::vector Tensor::shape() const { const auto &dims = impl_->dims(); - return phi::vectorize(dims); + return common::vectorize(dims); } const phi::DDim &Tensor::strides() const { @@ -134,7 +135,8 @@ void Tensor::reshape(const std::vector &shape) { "touching underlying data, this requires the total size of " "the tensor to remain constant."; if (is_dense_tensor()) { - static_cast(impl_.get())->Resize(phi::make_ddim(shape)); + static_cast(impl_.get()) + ->Resize(common::make_ddim(shape)); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support reshape operation on DenseTensor now.")); @@ -145,7 +147,7 @@ DataType Tensor::dtype() const { return impl_->dtype(); } DataType Tensor::type() const { return impl_->dtype(); } -DataLayout Tensor::layout() const { return impl_->layout(); } +phi::DataLayout Tensor::layout() const { return impl_->layout(); } bool Tensor::is_dense_tensor() const { return phi::DenseTensor::classof(impl_.get()); diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index aa9a678f2e10b5..09f5429a6623c6 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h" +#include "paddle/phi/core/enforce.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUDA @@ -91,7 +92,7 @@ PADDLE_API Tensor from_blob(void* data, } auto meta = - phi::DenseTensorMeta(dtype, phi::make_ddim(shape.GetData()), layout); + phi::DenseTensorMeta(dtype, common::make_ddim(shape.GetData()), layout); size_t size = SizeOf(dtype) * (meta.is_scalar ? 
1 : product(meta.dims)); diff --git a/paddle/phi/api/profiler/common_event.h b/paddle/phi/api/profiler/common_event.h index 76b9d5fa609b9b..d9e3ed74fd397a 100644 --- a/paddle/phi/api/profiler/common_event.h +++ b/paddle/phi/api/profiler/common_event.h @@ -18,10 +18,10 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/phi/api/profiler/event.h" // import EventRole, TODO(TIEXING): remove later #include "paddle/phi/api/profiler/trace_event.h" #include "paddle/phi/core/attribute.h" -#include "paddle/phi/core/ddim.h" namespace phi { diff --git a/paddle/phi/api/profiler/host_event_recorder.h b/paddle/phi/api/profiler/host_event_recorder.h index 349a31a25ad29d..bd75d5e3689d3d 100644 --- a/paddle/phi/api/profiler/host_event_recorder.h +++ b/paddle/phi/api/profiler/host_event_recorder.h @@ -18,8 +18,8 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/phi/common/thread_data_registry.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/core/os_info.h" namespace phi { diff --git a/paddle/phi/api/profiler/supplement_tracing.h b/paddle/phi/api/profiler/supplement_tracing.h index e93ad63b607ade..fc20f041ec02a7 100644 --- a/paddle/phi/api/profiler/supplement_tracing.h +++ b/paddle/phi/api/profiler/supplement_tracing.h @@ -18,8 +18,8 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/phi/core/attribute.h" -#include "paddle/phi/core/ddim.h" namespace phi { diff --git a/paddle/phi/api/yaml/generator/tensor_operants_gen.py b/paddle/phi/api/yaml/generator/tensor_operants_gen.py index bb6d42dc03964d..b09e336a138c66 100644 --- a/paddle/phi/api/yaml/generator/tensor_operants_gen.py +++ b/paddle/phi/api/yaml/generator/tensor_operants_gen.py @@ -228,7 +228,7 @@ class TensorOperantsBase { #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" """ @@ -349,7 +349,7 @@ class PhiTensorOperants : public TensorOperantsBase { #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" #include "paddle/utils/test_macros.h" """ @@ -444,7 +444,7 @@ class TEST_API OperantsManager { #include "glog/logging.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/utils/flags.h" """ diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 55e629de34e7e2..ed47487553bee7 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -64,5 +64,5 @@ if(WITH_CUSTOM_DEVICE) cc_test( capi_test SRCS custom/capi_test.cc - DEPS phi) + DEPS phi common) endif() diff --git a/paddle/phi/backends/c_comm_lib.h b/paddle/phi/backends/c_comm_lib.h index 30ebe6d2fa4961..682fc841e05b1a 100644 --- a/paddle/phi/backends/c_comm_lib.h +++ b/paddle/phi/backends/c_comm_lib.h @@ -15,11 +15,11 @@ #pragma once #include +#include "paddle/common/errors.h" +#include "paddle/common/macros.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/common/reduce_type.h" diff --git a/paddle/phi/backends/context_pool.h b/paddle/phi/backends/context_pool.h index a2bf9ed1cb03df..52f0ced275ac5e 100644 --- 
a/paddle/phi/backends/context_pool.h +++ b/paddle/phi/backends/context_pool.h @@ -20,10 +20,10 @@ limitations under the License. */ #include // NOLINT #include +#include "paddle/common/macros.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/test_macros.h" namespace phi { diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index 8508d5206558d2..c65e06364acd0e 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -15,10 +15,10 @@ limitations under the License. */ #pragma once #include +#include "paddle/common/errors.h" #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/xpu/xpu_info.h" diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 838b623ae7b381..2db75d7022f0a5 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -77,7 +77,7 @@ if(WITH_XPU) cc_library( phi_dynload_xpti SRCS xpti.cc - DEPS phi) + DEPS phi common) endif() if(WITH_FLASHATTN) @@ -101,5 +101,5 @@ if(WITH_CUDNN_FRONTEND) nv_test( cudnn_frontend_test SRCS cudnn_frontend_test.cc - DEPS phi cudnn-frontend) + DEPS phi common cudnn-frontend) endif() diff --git a/paddle/phi/backends/event.h b/paddle/phi/backends/event.h index 21dc9f47d7b89e..0ac87f376bfccb 100644 --- a/paddle/phi/backends/event.h +++ b/paddle/phi/backends/event.h @@ -13,8 +13,8 @@ // limitations under the License. 
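The backend headers below all follow a single include rule: utilities relocated into the standalone common library (macros, errors, layout, exception) are now included from paddle/common/ rather than paddle/phi/. A sketch of the rewrite, assuming the macro set moved with its header (DISABLE_COPY_AND_ASSIGN shown as an example of what these headers provide):

    // Before: #include "paddle/phi/core/macros.h"
    #include "paddle/common/macros.h"   // e.g. DISABLE_COPY_AND_ASSIGN

    // Before: #include "paddle/phi/core/errors.h"
    #include "paddle/common/errors.h"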
#pragma once +#include "paddle/common/macros.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/macros.h" namespace phi { diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph.h b/paddle/phi/backends/gpu/cuda/cuda_graph.h index 7b5644128c7cd4..a40cfc93ab81a2 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_graph.h +++ b/paddle/phi/backends/gpu/cuda/cuda_graph.h @@ -26,14 +26,14 @@ #include "glog/logging.h" +#include "paddle/common/errors.h" +#include "paddle/common/macros.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/device_code.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/optional.h" #if CUDA_VERSION < 11000 diff --git a/paddle/phi/backends/gpu/cuda/cudnn_desc.h b/paddle/phi/backends/gpu/cuda/cudnn_desc.h index d4fb6930bcc550..33565ba87413ff 100644 --- a/paddle/phi/backends/gpu/cuda/cudnn_desc.h +++ b/paddle/phi/backends/gpu/cuda/cudnn_desc.h @@ -132,7 +132,7 @@ class TensorDescriptor { T* desc() { return desc_.get(); } T* desc() const { return desc_.get(); } void set(const phi::DenseTensor& tensor, const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); + auto dims = common::vectorize(tensor.dims()); std::vector strides(dims.size()); strides[dims.size() - 1] = 1; for (int i = dims.size() - 2; i >= 0; i--) { @@ -168,7 +168,7 @@ class TensorDescriptor { } void set(const phi::DenseTensor& tensor, const cudnnTensorFormat_t format) { - auto dims = phi::vectorize(tensor.dims()); + auto dims = common::vectorize(tensor.dims()); auto dtype = ToCudnnDataType(tensor.dtype()); set(dims, format, dtype); } @@ -222,7 +222,7 @@ class FilterDescriptor { void set(const phi::DenseTensor& tensor, const cudnnTensorFormat_t format, const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); + auto dims = common::vectorize(tensor.dims()); auto dtype = ToCudnnDataType(tensor.dtype()); set(dims, format, dtype, groups); } diff --git a/paddle/phi/backends/gpu/cuda/cudnn_helper.h b/paddle/phi/backends/gpu/cuda/cudnn_helper.h index 74db3fc75bcd10..9ca5551857238d 100644 --- a/paddle/phi/backends/gpu/cuda/cudnn_helper.h +++ b/paddle/phi/backends/gpu/cuda/cudnn_helper.h @@ -17,14 +17,14 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/errors.h" +#include "paddle/common/macros.h" #include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/flags.h" PD_DECLARE_bool(cudnn_deterministic); @@ -374,7 +374,8 @@ class ScopedDropoutDescriptor { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, seed)); } else { - auto dropout_state_dims = phi::vectorize(dropout_state_->dims()); + auto dropout_state_dims = + common::vectorize(dropout_state_->dims()); state_size = dropout_state_dims[0]; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRestoreDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, 0)); diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index f87e3b3d805393..2a074e24a57405 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_info.h" diff --git a/paddle/phi/backends/gpu/rocm/miopen_desc.h b/paddle/phi/backends/gpu/rocm/miopen_desc.h index ae0e274ca650ef..55758968a30f67 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_desc.h +++ b/paddle/phi/backends/gpu/rocm/miopen_desc.h @@ -121,7 +121,7 @@ class TensorDescriptor { T* desc() const { return desc_.get(); } void set(const phi::DenseTensor& tensor, const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); + auto dims = common::vectorize(tensor.dims()); std::vector strides(dims.size()); strides[dims.size() - 1] = 1; for (int i = dims.size() - 2; i >= 0; i--) { @@ -145,7 +145,7 @@ class TensorDescriptor { format, MIOPEN_TENSOR_NCHW, phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - auto dims = phi::vectorize(tensor.dims()); + auto dims = common::vectorize(tensor.dims()); std::vector strides(dims.size()); strides[dims.size() - 1] = 1; for (int i = dims.size() - 2; i >= 0; i--) { @@ -195,7 +195,7 @@ class FilterDescriptor { format, MIOPEN_TENSOR_NCHW, phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); - auto dims = phi::vectorize(tensor.dims()); + auto dims = common::vectorize(tensor.dims()); std::vector strides(dims.size()); strides[dims.size() - 1] = 1; for (int i = dims.size() - 2; i >= 0; i--) { diff --git a/paddle/phi/backends/gpu/rocm/miopen_helper.h b/paddle/phi/backends/gpu/rocm/miopen_helper.h index f7815e2ed851e0..47603d0950f400 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_helper.h +++ b/paddle/phi/backends/gpu/rocm/miopen_helper.h @@ -19,14 +19,14 @@ limitations under the License. 
*/ #include "paddle/utils/flags.h" +#include "paddle/common/errors.h" +#include "paddle/common/macros.h" #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" -#include "paddle/phi/core/macros.h" // MIOPEN do not have epslion definition #define CUDNN_BN_MIN_EPSILON 1e-05 diff --git a/paddle/phi/backends/onednn/matmul_utils.cc b/paddle/phi/backends/onednn/matmul_utils.cc index 815663ca1e0ccc..ac2b5ab8c575df 100644 --- a/paddle/phi/backends/onednn/matmul_utils.cc +++ b/paddle/phi/backends/onednn/matmul_utils.cc @@ -18,11 +18,11 @@ namespace phi { namespace funcs { DDim RowMatrixDimsFromVector(const DDim& x_dim) { - return x_dim.size() > 1 ? x_dim : make_ddim({1, x_dim[0]}); + return x_dim.size() > 1 ? x_dim : common::make_ddim({1, x_dim[0]}); } DDim ColumnMatrixDimsFromVector(const DDim& y_dim) { - return y_dim.size() > 1 ? y_dim : make_ddim({y_dim[0], 1}); + return y_dim.size() > 1 ? y_dim : common::make_ddim({y_dim[0], 1}); } std::vector TransposeAxis(const std::vector& x, diff --git a/paddle/phi/backends/onednn/matmul_utils.h b/paddle/phi/backends/onednn/matmul_utils.h index 7248e64fe60b16..e1a4777041ceb4 100644 --- a/paddle/phi/backends/onednn/matmul_utils.h +++ b/paddle/phi/backends/onednn/matmul_utils.h @@ -146,7 +146,7 @@ inline void ExecuteMul(const OneDNNContext& dev_ctx, // This kernel is flattening dims so then we need to unflattened version // that should be set in out reshape require plain layout, but // MatmulV2MKLDNNHanlder enforces one so it should work - auto reshape_dims = out->dims().size() != 0 ? vectorize(out->dims()) + auto reshape_dims = out->dims().size() != 0 ? common::vectorize(out->dims()) : std::vector{1}; out->set_mem_desc(dst_memory_p->get_desc().reshape(reshape_dims)); } @@ -178,7 +178,7 @@ inline void ExecuteMatmul(const OneDNNContext& dev_ctx, matmul_p->execute(astream, matmul_args); astream.wait(); - auto reshape_dims = out->dims().size() != 0 ? vectorize(out->dims()) + auto reshape_dims = out->dims().size() != 0 ? common::vectorize(out->dims()) : std::vector{1}; out->set_mem_desc(dst_memory_p->get_desc().reshape(reshape_dims)); } diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h index b9f1d490874841..aec9f7f0d5e4f5 100644 --- a/paddle/phi/backends/onednn/onednn_context.h +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include // NOLINT #include "dnnl.hpp" // NOLINT +#include "paddle/common/layout.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/attribute.h" #include "paddle/utils/test_macros.h" diff --git a/paddle/phi/backends/onednn/onednn_helper.h b/paddle/phi/backends/onednn/onednn_helper.h index 1d61004b36161f..60c531c7b74435 100644 --- a/paddle/phi/backends/onednn/onednn_helper.h +++ b/paddle/phi/backends/onednn/onednn_helper.h @@ -18,8 +18,8 @@ #include "dnnl.hpp" // NOLINT #include "glog/logging.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/onednn/onednn_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" @@ -263,16 +263,16 @@ inline void MatchShapeToLayout(DenseTensor* tensor_in, // be done. 
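The MatchShapeToLayout hunks just below keep the interesting part intact: converting a shape between ONEDNN's NCHW-like order and NHWC/NDHWC is a single std::rotate of the dims vector. Concretely, with an assumed 4-D example:

#include <algorithm>
#include <cstdint>
#include <vector>

void RotateLayoutExample() {
  std::vector<int64_t> dims = {8, 3, 32, 32};  // N, C, H, W (ONEDNN order)
  std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end());
  // dims == {8, 32, 32, 3}: ONEDNN -> NHWC, channel moved behind W
  std::rotate(dims.begin() + 1, dims.end() - 1, dims.end());
  // dims == {8, 3, 32, 32} again: NHWC -> ONEDNN
}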
Similarly for dim==1 when you have just one possible combination. if (tensor_in->dims().size() < 3) { VLOG(3) << "Keeping ONEDNN/NHWC/NDHWC output_shape" - << print_dims(phi::vectorize(tensor_in->dims())); + << print_dims(common::vectorize(tensor_in->dims())); return; } switch (from) { case DataLayout::ONEDNN: if ((to == DataLayout::NHWC) || (to == DataLayout::NDHWC)) { - auto dims = phi::vectorize(tensor_in->dims()); + auto dims = common::vectorize(tensor_in->dims()); std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); - tensor_in->Resize(phi::make_ddim(dims)); + tensor_in->Resize(common::make_ddim(dims)); VLOG(3) << "Rotating Shape from: ONEDNN to: NHWC/NDHWC output_shape" << print_dims(dims); } @@ -280,9 +280,9 @@ inline void MatchShapeToLayout(DenseTensor* tensor_in, case DataLayout::NHWC: case DataLayout::NDHWC: if (to == DataLayout::ONEDNN) { - auto dims = phi::vectorize(tensor_in->dims()); + auto dims = common::vectorize(tensor_in->dims()); std::rotate(dims.begin() + 1, dims.end() - 1, dims.end()); - tensor_in->Resize(phi::make_ddim(dims)); + tensor_in->Resize(common::make_ddim(dims)); VLOG(3) << "Rotating Shape from: NHWC/NDHWC to: ONEDNN output_shape" << print_dims(dims); } diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index 990c6ea10bbadd..d9719c6f3e5b2c 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -957,8 +957,8 @@ class BinaryOneDNNHandler : public OneDNNHandlerNoCachingT { : OneDNNHandlerNoCachingT(engine, cpu_place) { use_broadcasting_hack = false; swin_case = false; - const auto src_x_tz = vectorize(x->dims()); - const auto src_y_tz = vectorize(y->dims()); + const auto src_x_tz = common::vectorize(x->dims()); + const auto src_y_tz = common::vectorize(y->dims()); // if output tensor(z) is nullptr then we are computing into oneDNN // managed buffer auto rankdiff = x->dims().size() - y->dims().size(); @@ -968,7 +968,7 @@ class BinaryOneDNNHandler : public OneDNNHandlerNoCachingT { : (y->dims().size() == 0 ? std::vector{1} : src_x_tz)) : (out->dims().size() == 0 ? std::vector{1} - : vectorize(out->dims())); + : common::vectorize(out->dims())); auto src0_md = x->mem_desc(); auto src1_md = y->mem_desc(); @@ -1216,8 +1216,9 @@ class BroadcastDataOneDNNHandler float scale_y, const std::vector& extended_x_dims) : OneDNNHandlerNoCachingT(engine, cpu_place) { - const auto src0_tz = out->dims().size() == 0 ? std::vector{1} - : vectorize(out->dims()); + const auto src0_tz = out->dims().size() == 0 + ? 
std::vector{1} + : common::vectorize(out->dims()); const auto src0_md = dnnl::memory::desc( src0_tz, OneDNNGetDataType(), GetPlainOneDNNFormat(src0_tz.size())); const auto reshape_dims = @@ -1264,7 +1265,7 @@ class PReluOneDNNHandler const bool is_test) : OneDNNHandlerNoCachingT( engine, cpu_place) { - auto weights_dims = vectorize(weights.dims()); + auto weights_dims = common::vectorize(weights.dims()); // weights must have same size as X only for "element" case if (weights.dims().size() != x.dims().size()) { auto new_weights_dims = std::vector(x.dims().size(), 1); @@ -1465,7 +1466,7 @@ class BatchNormOneDNNHandler } std::shared_ptr AcquireScaleMemory(const DenseTensor* scale) { - auto scale_tz = vectorize(scale->dims()); + auto scale_tz = common::vectorize(scale->dims()); PADDLE_ENFORCE_EQ( scale_tz.size(), 1, @@ -1480,7 +1481,7 @@ class BatchNormOneDNNHandler } std::shared_ptr AcquireShiftMemory(const DenseTensor* shift) { - auto shift_tz = vectorize(shift->dims()); + auto shift_tz = common::vectorize(shift->dims()); PADDLE_ENFORCE_EQ( shift_tz.size(), 1, @@ -1600,8 +1601,8 @@ class PoolingOneDNNHandler auto onednn_paddings = ToOneDNNPadding(copied_paddings); const auto dt = ToOneDNNDataType(input->dtype()); - const auto src_tz = vectorize(input->dims()); - const auto dst_tz = vectorize(output->dims()); + const auto src_tz = common::vectorize(input->dims()); + const auto dst_tz = common::vectorize(output->dims()); const auto dst_md = OneDNNMemDesc(dst_tz, dt, OneDNNMemoryFormat::any); if (ceil_mode) { @@ -1689,9 +1690,9 @@ class PoolingOneDNNHandler copied_strides, copied_kernel_size); - auto src_tz = vectorize(in_x->dims()); - auto diff_src_tz = vectorize(in_x_grad->dims()); - auto diff_dst_tz = vectorize(out_grad->dims()); + auto src_tz = common::vectorize(in_x->dims()); + auto diff_src_tz = common::vectorize(in_x_grad->dims()); + auto diff_dst_tz = common::vectorize(out_grad->dims()); const auto dt = ToOneDNNDataType(in_x->dtype()); auto dst_md = dnnl::memory::desc(diff_dst_tz, dt, OneDNNMemoryFormat::any); @@ -1910,7 +1911,7 @@ static void SetOutMemDescWithUnsqueeze2FuseSupport( } } out->set_mem_desc(out_md.reshape(unsqueezed_op_tz)); - out->Resize(make_ddim(unsqueezed_op_tz)); + out->Resize(common::make_ddim(unsqueezed_op_tz)); } static void SetOutMemDescWithReshape2FuseSupport( @@ -1934,7 +1935,7 @@ static void SetOutMemDescWithReshape2FuseSupport( } out->set_mem_desc(out_md.reshape(fused_reshape2_shape)); - out->Resize(phi::make_ddim(fused_reshape2_shape)); + out->Resize(common::make_ddim(fused_reshape2_shape)); } } // namespace funcs diff --git a/paddle/phi/backends/stream.h b/paddle/phi/backends/stream.h index 4219b1cec49044..43f15ef08c0349 100644 --- a/paddle/phi/backends/stream.h +++ b/paddle/phi/backends/stream.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/macros.h" #include "paddle/phi/backends/callback_manager.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/macros.h" namespace phi { diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 14164c4e9ddc7e..e3931d3da19b19 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -18,7 +18,7 @@ #include "glog/logging.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/os_info.h" diff --git a/paddle/phi/capi/include/type_utils.h b/paddle/phi/capi/include/type_utils.h index 
029ee42fe091bc..69b1a213bf01f4 100644 --- a/paddle/phi/capi/include/type_utils.h +++ b/paddle/phi/capi/include/type_utils.h @@ -15,9 +15,9 @@ #pragma once #if !defined(_WIN32) +#include "paddle/common/layout.h" #include "paddle/phi/capi/include/c_data_type.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/capi/include/wrapper_base.h b/paddle/phi/capi/include/wrapper_base.h index 9924f4d5efb6ba..061561008a95e7 100644 --- a/paddle/phi/capi/include/wrapper_base.h +++ b/paddle/phi/capi/include/wrapper_base.h @@ -23,7 +23,7 @@ #include #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/capi/include/c_device_context.h" #include "paddle/phi/capi/include/c_infer_meta_context.h" #include "paddle/phi/capi/include/c_int_array.h" diff --git a/paddle/phi/capi/lib/c_meta_tensor.cc b/paddle/phi/capi/lib/c_meta_tensor.cc index d2493058081584..6ea6eda1a7f23e 100644 --- a/paddle/phi/capi/lib/c_meta_tensor.cc +++ b/paddle/phi/capi/lib/c_meta_tensor.cc @@ -114,7 +114,7 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, } auto cc_tensor = reinterpret_cast(tensor); std::vector shape(dims, dims + ndims); - cc_tensor->set_dims(phi::make_ddim(shape)); + cc_tensor->set_dims(common::make_ddim(shape)); } void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, diff --git a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc index b460d2e368607c..31a724447b7c7f 100644 --- a/paddle/phi/capi/lib/c_tensor.cc +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -198,7 +198,7 @@ void PD_TensorSetDims(PD_Tensor* tensor, } auto cc_tensor = reinterpret_cast(tensor); std::vector shape(dims, dims + ndims); - cc_tensor->Resize(phi::make_ddim(shape)); + cc_tensor->Resize(common::make_ddim(shape)); } void PD_TensorSetDataType(PD_Tensor* tensor, diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 5540592d5013c8..64dab3ccdeb3b4 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/common/place.h" namespace paddle { diff --git a/paddle/phi/common/cpstring_impl.h b/paddle/phi/common/cpstring_impl.h index 6783799026d44b..1906fd4e57a444 100644 --- a/paddle/phi/common/cpstring_impl.h +++ b/paddle/phi/common/cpstring_impl.h @@ -24,7 +24,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" #if (defined(__NVCC__) || defined(__HIPCC__)) #define HOSTDEVICE __host__ __device__ diff --git a/paddle/phi/common/data_type.h b/paddle/phi/common/data_type.h index 58852cf5adb022..36761d673d5396 100644 --- a/paddle/phi/common/data_type.h +++ b/paddle/phi/common/data_type.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/common/int_array.cc b/paddle/phi/common/int_array.cc index 4b5d553006685b..75440bd2d5b818 100644 --- a/paddle/phi/common/int_array.cc +++ b/paddle/phi/common/int_array.cc @@ -14,10 +14,10 @@ limitations under the License. 
*/ #include "paddle/phi/common/int_array.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/tensor_utils.h" namespace paddle { diff --git a/paddle/phi/common/int_array.h b/paddle/phi/common/int_array.h index 0c4b3d4c8ca5b7..6eab8609e54b26 100644 --- a/paddle/phi/common/int_array.h +++ b/paddle/phi/common/int_array.h @@ -16,13 +16,13 @@ limitations under the License. */ #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/tensor_ref.h" -namespace phi { +namespace common { class DDim; -} // namespace phi +} // namespace common namespace paddle { class Tensor; @@ -55,7 +55,7 @@ class IntArrayBase { void SetFromTensor(bool val) { is_from_tensor_ = val; } - explicit IntArrayBase(const phi::DDim& dims); + explicit IntArrayBase(const common::DDim& dims); // The Tensor must have one dim IntArrayBase(const T& tensor); // NOLINT diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index 2571ec6d2788b7..e2a590ee4d210c 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -17,11 +17,11 @@ #include // NOLINT #include +#include "paddle/common/macros.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/device_context.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/core/stream.h" #include "paddle/utils/test_macros.h" diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index 61bb82176388c2..008f45aa935544 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/backends/gpu/gpu_info.h" namespace phi { diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index 03072468f62e20..8865d9c2690cdb 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -18,8 +18,8 @@ limitations under the License. */ #include #include +#include "paddle/common/macros.h" #include "paddle/phi/api/include/dll_decl.h" -#include "paddle/phi/core/macros.h" #include "paddle/utils/test_macros.h" namespace paddle { diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 5ed843653887b4..12de9149a96af6 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -19,7 +19,7 @@ limitations under the License. 
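Note the replaced forward declaration in int_array.h above: because DDim changed namespaces, "namespace phi { class DDim; }" becomes "namespace common { class DDim; }", and the constructor signature follows. Forward-declaring keeps paddle/common/ddim.h out of a widely included header; a sketch of why a declaration suffices here (the class below is an illustrative stand-in, not the real IntArrayBase):

namespace common {
class DDim;  // forward declaration: the complete type is not needed yet
}  // namespace common

class IntArrayLike {
 public:
  explicit IntArrayLike(const common::DDim& dims);  // reference parameter: OK
 private:
  // common::DDim dims_;  // a by-value member would need the full definition
};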
*/ #include #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/common/data_type.h" namespace paddle { diff --git a/paddle/phi/common/tensor_ref.h b/paddle/phi/common/tensor_ref.h index aa0338006f4ccd..6ecea89b06ba44 100644 --- a/paddle/phi/common/tensor_ref.h +++ b/paddle/phi/common/tensor_ref.h @@ -20,7 +20,7 @@ #include #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/common/data_type.h" namespace phi { diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index b9061b64087b08..b582ef84942df1 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -10,12 +10,10 @@ collect_srcs( core_srcs SRCS flags.cc - errors.cc enforce.cc storage_properties.cc os_info.cc kernel_context.cc - ddim.cc tensor_base.cc allocator.cc tensor_meta.cc diff --git a/paddle/phi/core/attribute.h b/paddle/phi/core/attribute.h index 6f032f4a5bd99c..88ab2dbf1df496 100644 --- a/paddle/phi/core/attribute.h +++ b/paddle/phi/core/attribute.h @@ -17,9 +17,9 @@ #include #include +#include "paddle/common/layout.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/tensor_ref.h" #include "paddle/utils/flat_hash_map.h" diff --git a/paddle/phi/core/compat/convert_utils.h b/paddle/phi/core/compat/convert_utils.h index a6ae22ff669fcc..632b7a6d17ef27 100644 --- a/paddle/phi/core/compat/convert_utils.h +++ b/paddle/phi/core/compat/convert_utils.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once +#include "paddle/common/layout.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index cfa647149669c8..b2c334d89023df 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -18,10 +18,10 @@ limitations under the License. */ #include #include "glog/logging.h" +#include "paddle/common/macros.h" #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/core/type_defs.h" #include "paddle/utils/flat_hash_map.h" diff --git a/paddle/phi/core/custom_kernel.h b/paddle/phi/core/custom_kernel.h index 5ba14de6a6131c..1f6386a378aeec 100644 --- a/paddle/phi/core/custom_kernel.h +++ b/paddle/phi/core/custom_kernel.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/macros.h" #include "paddle/phi/core/kernel_factory.h" -#include "paddle/phi/core/macros.h" namespace phi { /** diff --git a/paddle/phi/core/ddim.cc b/paddle/phi/core/ddim.cc deleted file mode 100644 index ff95346be17c7a..00000000000000 --- a/paddle/phi/core/ddim.cc +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/ddim.h" - -#include - -namespace phi { - -DDim make_ddim(std::initializer_list dims) { - return DDim(dims.begin(), static_cast(dims.size())); -} - -DDim make_ddim(const std::vector& dims) { - return DDim(dims.data(), static_cast(dims.size())); -} - -DDim make_ddim(const std::vector& dims) { - return DDim(dims.data(), static_cast(dims.size())); -} - -struct DDimEqualityVisitor { - explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {} - - template - inline bool operator()(const Dim& self) const { - return UnrollCompare::Run(self.Get(), d_); - } - - const int64_t* d_; -}; - -bool DDim::operator==(const DDim& d) const { - if (size() == -1 && d.size() == -1) { - return true; - } else if (size() == -1 || d.size() == -1) { - return false; - } else { - return size() == d.size() && - this->apply_visitor(DDimEqualityVisitor(d.Get())); - } -} - -bool DDim::operator!=(const DDim& d) const { return !(*this == d); } - -std::string DDim::to_str() const { - std::stringstream ss; - ss << '['; - if (rank_ > 0) ss << dim_[0]; - - for (int i = 1; i < rank_; ++i) ss << ", " << dim_[i]; - ss << ']'; - return ss.str(); -} - -struct ProductVisitor { - template - inline int64_t operator()(const Dim& dim) { - return product(dim); - } -}; - -int64_t product(const DDim& ddim) { - if (ddim.size() == -1) { - return 0; - } - return ddim.apply_visitor(ProductVisitor()); -} - -bool contain_unknown_dim(const DDim& ddim) { - for (int i = 0; i < ddim.size(); ++i) { - if (ddim[i] < 0) { - return true; - } - } - - return false; -} - -DDim slice_ddim(const DDim& dim, int begin, int end) { - PADDLE_ENFORCE_EQ( - (begin >= 0 && end <= dim.size()), - true, - phi::errors::InvalidArgument( - "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", - begin, - end, - dim.size())); - // Constructor of DDim would check whether end - begin is valid - return DDim(dim.Get() + begin, end - begin); -} - -int arity(const DDim& d) { return d.size(); } - -struct DDimPrinter { - std::ostream& os; - explicit DDimPrinter(std::ostream& os_) : os(os_) {} - - template - void operator()(const Dim& t) { - os << t; - } -}; - -std::ostream& operator<<(std::ostream& os, const DDim& ddim) { - if (ddim.size() == -1) { - return os; - } - ddim.apply_visitor(DDimPrinter(os)); - return os; -} - -DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims) { - PADDLE_ENFORCE_GE( - src.size(), - 3, - phi::errors::InvalidArgument("The rank of src dim should be at least 3 " - "in flatten_to_3d, but received %d.", - src.size())); - PADDLE_ENFORCE_EQ( - (num_row_dims >= 1 && num_row_dims < src.size()), - true, - phi::errors::InvalidArgument("The num_row_dims should be inside [1, %d] " - "in flatten_to_3d, but received %d.", - src.size() - 1, - num_row_dims)); - PADDLE_ENFORCE_EQ( - (num_col_dims >= 2 && num_col_dims <= src.size()), - true, - phi::errors::InvalidArgument("The num_col_dims should be inside [2, %d] " - "in flatten_to_3d, but received %d.", - src.size(), - num_col_dims)); - PADDLE_ENFORCE_GE( - num_col_dims, - num_row_dims, - phi::errors::InvalidArgument( - "The num_row_dims should be less 
than num_col_dims in flatten_to_3d," - "but received num_row_dims = %d, num_col_dims = %d.", - num_row_dims, - num_col_dims)); - - return DDim({product(slice_ddim(src, 0, num_row_dims)), - product(slice_ddim(src, num_row_dims, num_col_dims)), - product(slice_ddim(src, num_col_dims, src.size()))}); -} - -DDim flatten_to_2d(const DDim& src, int num_col_dims) { - return DDim({product(slice_ddim(src, 0, num_col_dims)), - product(slice_ddim(src, num_col_dims, src.size()))}); -} - -DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); } - -DDim stride(const DDim& ddim) { - DDim strides; - strides.rank_ = ddim.size(); - if (ddim.size() > 0) strides[ddim.size() - 1] = 1; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i + 1]; - } - return strides; -} - -DDim stride_numel(const DDim& ddim) { - DDim strides; - strides.rank_ = ddim.size(); - if (ddim.size() > 0) strides[ddim.size() - 1] = ddim[ddim.size() - 1]; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i]; - } - return strides; -} - -DDim DDim::reshape(std::vector& shape) const { - const DDim& in_dims = *this; - - for (int i = 0; i < static_cast(shape.size()); ++i) { - if (shape[i] == 0) { - shape[i] = static_cast(in_dims.at(i)); - } - } - - // Dim marked as "-1" must be inferred - auto it = std::find(shape.begin(), shape.end(), -1); - if (it != shape.end()) { - int index = static_cast(std::distance(shape.begin(), it)); - int reshape_out_product = - std::accumulate(shape.begin(), shape.end(), -1, std::multiplies()); - shape[index] = static_cast(product(in_dims)) / reshape_out_product; - } - - return phi::make_ddim(shape); -} - -DDim DDim::transpose(const std::vector& axis) const { - const DDim& in_dims = *this; - - DDim out_dims(in_dims); - for (int i = 0; i < static_cast(axis.size()); i++) { - out_dims[i] = in_dims[axis[i]]; - } - return out_dims; -} - -} // namespace phi - -namespace std { - -std::size_t hash::operator()(phi::DDim const& ddim) const { - int ndim = ddim.size(); - std::size_t seed = ndim; - for (int i = 0; i < ndim; ++i) { - seed ^= ddim.Get()[i] + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - return seed; -} - -} // namespace std diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h deleted file mode 100644 index ff2abdb3b84b39..00000000000000 --- a/paddle/phi/core/ddim.h +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
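The ddim.cc removed above is being moved, not rewritten, so its semantics carry over unchanged; a few values worked out from the deleted definitions make the helpers concrete (the paddle/common/ddim.h include path and common:: namespace for the free functions are assumed from the rest of this patch):

#include <vector>

#include "paddle/common/ddim.h"

void DDimExamples() {
  common::DDim d = common::make_ddim({2, 3, 4});
  // common::product(d)          -> 24
  // common::stride(d)           -> {12, 4, 1}  (strides[i] = strides[i+1] * d[i+1])
  // common::stride_numel(d)     -> {24, 12, 4} (strides[i] = strides[i+1] * d[i])
  // common::slice_ddim(d, 1, 3) -> {2, 3}
  std::vector<int> shape = {0, -1};
  // d.reshape(shape)            -> {2, 12}: a 0 copies the input dim at that
  //                                index, a single -1 is inferred from product(d)
}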
-#pragma once -#include -#include -#include -#include -#include - -#include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/core/utils/dim.h" -#include "paddle/utils/test_macros.h" - -namespace phi { - -#define PADDLE_VISIT_DDIM_BASE(rank, callback) \ - case (rank): { \ - constexpr auto kRank = (rank); \ - return (callback); \ - } - -#define PADDLE_VISIT_DDIM(rank, callback) \ - switch (rank) { \ - PADDLE_VISIT_DDIM_BASE(0, callback); \ - PADDLE_VISIT_DDIM_BASE(1, callback); \ - PADDLE_VISIT_DDIM_BASE(2, callback); \ - PADDLE_VISIT_DDIM_BASE(3, callback); \ - PADDLE_VISIT_DDIM_BASE(4, callback); \ - PADDLE_VISIT_DDIM_BASE(5, callback); \ - PADDLE_VISIT_DDIM_BASE(6, callback); \ - PADDLE_VISIT_DDIM_BASE(7, callback); \ - PADDLE_VISIT_DDIM_BASE(8, callback); \ - PADDLE_VISIT_DDIM_BASE(9, callback); \ - default: \ - PD_THROW( \ - "Unimplemented error. Invalid dimension to be accessed. Now only " \ - "supports access to " \ - "dimension 0 to 9, but received dimension is ", \ - rank, \ - "."); \ - } - -template -inline void dynamic_dim_assign(const T1* in, T2* out, int n) { - if (n == -1) { - return; - } - PADDLE_VISIT_DDIM(n, (static_dim_assign(in, out))); -} - -/** - * \brief A dynamically sized dimension. - * - * The number of dimensions must be between [1, 9]. - */ -class DDim { - public: - constexpr static int kMaxRank = 9; - - DDim() : rank_(-1) { dim_[0] = 0; } - - DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); } - - DDim(const int* d, int n) : rank_(n) { - dynamic_dim_assign(d, dim_.GetMutable(), n); - } - - DDim(const int64_t* d, int n) : rank_(n) { - dynamic_dim_assign(d, dim_.GetMutable(), n); - } - - template - /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT - UnsafeCast() = in; - } - - /*implicit*/ DDim(std::initializer_list init_list) - : DDim(init_list.begin(), init_list.size()) {} - - inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); } - - template - inline DDim& operator=(const Dim& dim) { - rank_ = D; - UnsafeCast() = dim; - return *this; - } - - inline int64_t& operator[](int idx) { return dim_[idx]; } - - inline int64_t operator[](int idx) const { return dim_[idx]; } - - int64_t& at(int idx) { - PADDLE_ENFORCE_GE(idx, - 0, - phi::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - PADDLE_ENFORCE_LT(idx, - rank_, - phi::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - return dim_[idx]; - } - - int64_t at(int idx) const { - PADDLE_ENFORCE_GE(idx, - 0, - phi::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - PADDLE_ENFORCE_LT(idx, - rank_, - phi::errors::InvalidArgument( - "Invalid DDim index to be accessed. 
The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - return dim_[idx]; - } - - template - typename std::result_of&)>::type apply_visitor( - Visitor&& visitor) { - PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); - } - - template - typename std::result_of&)>::type apply_visitor( - Visitor&& visitor) const { - PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); - } - - bool operator==(const DDim& d) const; - - bool operator!=(const DDim& d) const; - - inline const int64_t* Get() const { return dim_.Get(); } - - inline int64_t* GetMutable() { return dim_.GetMutable(); } - - inline int size() const { return rank_; } - - std::string to_str() const; - - DDim reshape(std::vector& shape) const; // NOLINT - - DDim transpose(const std::vector& axis) const; - - private: - template - inline Dim& UnsafeCast() { - static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); - auto* p = static_cast(&dim_); - return *reinterpret_cast*>(p); - } - - template - inline const Dim& UnsafeCast() const { - static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); - auto* p = static_cast(&dim_); - return *reinterpret_cast*>(p); - } - - inline DDim& CopyFrom(const DDim& ddim) { - if (ddim.rank_ == -1) { - rank_ = -1; - return *this; - } - PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast())); - } - - friend DDim stride(const DDim& ddim); - friend DDim stride_numel(const DDim& ddim); - - private: - Dim dim_; - int rank_; -}; - -#undef PADDLE_VISIT_DDIM_BASE -#undef PADDLE_VISIT_DDIM - -/** - * \brief Make a DDim from std::vector - * - * \param dims An vector of ints. Must be sized between [1, 9] - */ -TEST_API DDim make_ddim(const std::vector& dims); - -TEST_API DDim make_ddim(const std::vector& dims); - -/** - * \brief Make a DDim from an initializer list - * - * \param dims An initializer list of ints. Must be sized between [1, 9] - * - */ -TEST_API DDim make_ddim(std::initializer_list dims); - -template -std::vector vectorize(const DDim& ddim) { - if (ddim.size() == -1) { - return std::vector({0}); - } - std::vector result(DDim::kMaxRank); - dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); - result.resize(ddim.size()); - return result; -} - -TEST_API int64_t product(const DDim& ddim); - -bool contain_unknown_dim(const DDim& ddim); - -/** - * \brief Slice a ddim - * - * Slice dim with [begin, end). - * e.g. DDim d = make_ddim({1,2,3,4,5}); - * slice_ddim(d, 1, 3); ====> {2,3} - */ -DDim slice_ddim(const DDim& dim, int begin, int end); - -/** - * \brief What is the length of this dimension? - * - * \param Dynamic dimension to inspect - */ - -int arity(const DDim& ddim); - -std::ostream& operator<<(std::ostream&, const DDim&); - -/** - * \brief Flatten dim to 3d - * e.g., DDim d = mak_ddim({1, 2, 3, 4, 5, 6}) - * flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30} - */ -DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims); - -// Reshape a tensor to a matrix. The matrix's first dimension(column length) -// will be the product of tensor's first `num_col_dims` dimensions. 
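Rounding out the flatten family documented above, the values follow directly from the product-of-slices definitions in the deleted ddim.cc:

// d = {1, 2, 3, 4, 5, 6}
// flatten_to_3d(d, 2, 4) -> {1*2, 3*4, 5*6} -> {2, 12, 30}
// flatten_to_2d(d, 4)    -> {1*2*3*4, 5*6}  -> {24, 30}
// flatten_to_1d(d)       -> {720}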
-DDim flatten_to_2d(const DDim& src, int num_col_dims); - -DDim flatten_to_1d(const DDim& src); - -DDim stride(const DDim& ddim); - -DDim stride_numel(const DDim& ddim); -} // namespace phi - -namespace paddle { -namespace framework { - -using DDim = phi::DDim; - -} // namespace framework -} // namespace paddle - -namespace std { -template <> -struct hash { - std::size_t operator()(phi::DDim const& ddim) const; -}; -} // namespace std diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 3cbbfde38fe9a7..bcc2b07a89e3a3 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/storage_properties.h" #include "paddle/phi/core/stream.h" #include "paddle/phi/core/tensor_base.h" diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc index f4d6be6c779b5e..99161488b54af8 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc +++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc @@ -45,7 +45,8 @@ DistTensor::DistTensor(const std::shared_ptr& global_value, if (!dist_attr.is_replicated()) { value_ = std::make_shared(); // 1. create replicated global tensor - TensorDistAttr replicated_dist_attr(vectorize(global_value->dims())); + TensorDistAttr replicated_dist_attr( + common::vectorize(global_value->dims())); replicated_dist_attr.set_process_mesh(dist_attr.process_mesh()); DistTensor replicated_tensor(global_value, replicated_dist_attr); @@ -80,7 +81,7 @@ DistTensor::DistTensor(const std::shared_ptr& global_value, } idx++; } - TensorDistAttr dist_attr(vectorize(dist_tensor_meta_.dims())); + TensorDistAttr dist_attr(common::vectorize(dist_tensor_meta_.dims())); dist_attr.set_process_mesh(dist_tensor_meta_.process_mesh()); dist_attr.set_dims_mapping(dist_tensor_meta_.dim_mapping()); dist_attr.set_partial_status(partial_dims); @@ -94,7 +95,8 @@ DistTensor::DistTensor(const std::shared_ptr& global_value, if (!dist_tensor_meta_.is_replicated()) { value_ = std::make_shared(); // 1. create replicated global tensor - TensorDistAttr replicated_dist_attr(vectorize(global_value->dims())); + TensorDistAttr replicated_dist_attr( + common::vectorize(global_value->dims())); replicated_dist_attr.set_process_mesh(process_mesh); DistTensor replicated_tensor(global_value, replicated_dist_attr); diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h index 2d444decf640ab..71395507a09519 100644 --- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include +#include "paddle/common/macros.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/attribute.h" @@ -26,7 +27,6 @@ limitations under the License. 
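A pattern that repeats through the reshard hunks below: when the split axis is not already leading, a transpose permutation is built from the identity by swapping axis 0 with the split axis. Isolated from the p_to_s_reshard_function.cc hunk further on (rank and axis values assumed for the example):

#include <algorithm>
#include <vector>

std::vector<int> MakeSplitPermutation(int rank, int out_split_axis) {
  std::vector<int> axis(rank);
  for (int i = 0; i < rank; ++i) axis[i] = i;  // identity permutation
  std::swap(axis[0], axis[out_split_axis]);    // bring the split axis first
  return axis;  // e.g. rank 4, out_split_axis 2 -> {2, 1, 0, 3}
}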
*/ #include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" #include "paddle/phi/core/distributed/type_defs.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/core/type_defs.h" #include "paddle/utils/any.h" #include "paddle/utils/flat_hash_map.h" diff --git a/paddle/phi/core/distributed/auto_parallel/placement_types.h b/paddle/phi/core/distributed/auto_parallel/placement_types.h index 08e128d9c6f379..ca92eb8003d64a 100644 --- a/paddle/phi/core/distributed/auto_parallel/placement_types.h +++ b/paddle/phi/core/distributed/auto_parallel/placement_types.h @@ -24,10 +24,10 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/phi/common/reduce_type.h" #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/tensor_meta.h" namespace phi { diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc index 778cf72e27a612..7c0f9017366338 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc @@ -120,13 +120,13 @@ void SameNdMeshReshardFunction::Eval(phi::DeviceContext* dev_ctx, ProcessMesh sub_mesh = GetSubProcessMesh(process_mesh, kv.first); // 1.3 Calculate the input one dim dist attr - TensorDistAttr in_one_dim_dist_attr(vectorize(in.dims())); + TensorDistAttr in_one_dim_dist_attr(common::vectorize(in.dims())); in_one_dim_dist_attr.set_process_mesh(sub_mesh); in_one_dim_dist_attr.set_partial_status(std::vector{0}, kv.second); // 1.4 Calculate the output one dim dist attr - TensorDistAttr out_one_dim_dist_attr(vectorize(in.dims())); + TensorDistAttr out_one_dim_dist_attr(common::vectorize(in.dims())); out_one_dim_dist_attr.set_process_mesh(sub_mesh); // 1.5 Change from partial to replicated @@ -158,7 +158,7 @@ void SameNdMeshReshardFunction::Eval(phi::DeviceContext* dev_ctx, ProcessMesh sub_mesh = GetSubProcessMesh(process_mesh, in_mesh_axis); // 2.3 Calculate the input one dim dist attr - TensorDistAttr in_one_dim_dist_attr(vectorize(in.dims())); + TensorDistAttr in_one_dim_dist_attr(common::vectorize(in.dims())); in_one_dim_dist_attr.set_process_mesh(sub_mesh); std::vector in_one_dims_mapping = in_one_dim_dist_attr.dims_mapping(); @@ -166,7 +166,7 @@ void SameNdMeshReshardFunction::Eval(phi::DeviceContext* dev_ctx, in_one_dim_dist_attr.set_dims_mapping(in_one_dims_mapping); // 2.4 Calculate the output one dim dist attr - TensorDistAttr out_one_dim_dist_attr(vectorize(in.dims())); + TensorDistAttr out_one_dim_dist_attr(common::vectorize(in.dims())); out_one_dim_dist_attr.set_process_mesh(sub_mesh); // 2.5 Change from shard to replicated @@ -198,11 +198,11 @@ void SameNdMeshReshardFunction::Eval(phi::DeviceContext* dev_ctx, ProcessMesh sub_mesh = GetSubProcessMesh(process_mesh, kv.first); // 3.3 Calculate the input one dim dist attr - TensorDistAttr in_one_dim_dist_attr(vectorize(in.dims())); + TensorDistAttr in_one_dim_dist_attr(common::vectorize(in.dims())); in_one_dim_dist_attr.set_process_mesh(sub_mesh); // 3.4 Calculate the output one dim dist attr - TensorDistAttr out_one_dim_dist_attr(vectorize(in.dims())); + TensorDistAttr out_one_dim_dist_attr(common::vectorize(in.dims())); out_one_dim_dist_attr.set_process_mesh(sub_mesh); 
out_one_dim_dist_attr.set_partial_status(std::vector{0}); @@ -238,11 +238,11 @@ void SameNdMeshReshardFunction::Eval(phi::DeviceContext* dev_ctx, ProcessMesh sub_mesh = GetSubProcessMesh(process_mesh, out_mesh_axis); // 4.3 Calculate the input one dim dist attr - TensorDistAttr in_one_dim_dist_attr(vectorize(in.dims())); + TensorDistAttr in_one_dim_dist_attr(common::vectorize(in.dims())); in_one_dim_dist_attr.set_process_mesh(sub_mesh); // 4.4 Calculate the output one dim dist attr - TensorDistAttr out_one_dim_dist_attr(vectorize(in.dims())); + TensorDistAttr out_one_dim_dist_attr(common::vectorize(in.dims())); out_one_dim_dist_attr.set_process_mesh(sub_mesh); std::vector out_one_dims_mapping = out_one_dim_dist_attr.dims_mapping(); diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/p_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/p_to_s_reshard_function.cc index dcb9096544b3a5..07b047db612507 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/p_to_s_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/p_to_s_reshard_function.cc @@ -58,7 +58,7 @@ void PToSReshardFunction::Eval(DeviceContext* dev_ctx, DenseTensor in_reduce_scatter = in.value(); std::vector axis; if (out_split_axis != 0) { - for (size_t i = 0; i < vectorize(logical_ddim).size(); ++i) { + for (size_t i = 0; i < common::vectorize(logical_ddim).size(); ++i) { axis.emplace_back(i); } std::swap(axis[0], axis[out_split_axis]); diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.cc index 3f766eb8d6acd7..3669e09890ba8a 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.cc @@ -43,7 +43,7 @@ void ReshardFunction::SetValue(DistTensor* tensor, const DenseTensor& value) { void ReshardFunction::SetDistProps(DistTensor* tensor, const DDim& dims, const TensorDistAttr& dist_attr) { - PADDLE_ENFORCE_EQ(dist_attr.verify(vectorize(dims)), + PADDLE_ENFORCE_EQ(dist_attr.verify(common::vectorize(dims)), true, phi::errors::InvalidArgument( "The input dist_attr [%s] and dims [%s] are improper.", @@ -56,7 +56,7 @@ void ReshardFunction::SetDistProps(DistTensor* tensor, void ReshardFunction::SetDistProps(DistTensor* tensor, const TensorDistAttr& dist_attr) { - PADDLE_ENFORCE_EQ(dist_attr.verify(vectorize(tensor->dims())), + PADDLE_ENFORCE_EQ(dist_attr.verify(common::vectorize(tensor->dims())), true, phi::errors::InvalidArgument( "The input dist_attr and dims are improper.")); diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/s_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/s_to_s_reshard_function.cc index 931d3d8bc1d89a..fa1c78c3160e0b 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/s_to_s_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/s_to_s_reshard_function.cc @@ -62,7 +62,7 @@ void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx, // 1. 
preprocess, reshape and transpose the input tensor if (out_split_axis != 0) { // 1.1 calc the shape and reshape - std::vector pre_shape_vec = vectorize(logical_ddim); + std::vector pre_shape_vec = common::vectorize(logical_ddim); pre_shape_vec[in_split_axis] /= nranks; pre_shape_vec[out_split_axis] /= nranks; pre_shape_vec.insert(pre_shape_vec.begin() + out_split_axis, nranks); @@ -102,7 +102,7 @@ void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx, // 3. postprocess, reshape and transpose the output tensor if (in_split_axis != 0) { // 3.1 calc the shape and reshape - std::vector post_shape_vec = vectorize(logical_ddim); + std::vector post_shape_vec = common::vectorize(logical_ddim); post_shape_vec[in_split_axis] /= nranks; post_shape_vec[out_split_axis] /= nranks; post_shape_vec.insert(post_shape_vec.begin(), nranks); diff --git a/paddle/phi/core/distributed/bkcl_comm_context.cc b/paddle/phi/core/distributed/bkcl_comm_context.cc index 2f5fe0eb3ccbe4..bb301661ab8367 100644 --- a/paddle/phi/core/distributed/bkcl_comm_context.cc +++ b/paddle/phi/core/distributed/bkcl_comm_context.cc @@ -129,7 +129,7 @@ void BKCLCommContext::Recv(phi::DenseTensor* out_tensor, ToBKCLDataType(out_tensor->dtype()), stream)); VLOG(3) << "rank " << GetRank() << " recv " - << phi::product(out_tensor->dims()) << " from " << peer; + << common::product(out_tensor->dims()) << " from " << peer; } void BKCLCommContext::AllReduce(phi::DenseTensor* out_tensor, diff --git a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc index 57bdf12bce7992..9307af45bd622b 100644 --- a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc +++ b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc @@ -16,9 +16,9 @@ #include "glog/logging.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #if defined(PADDLE_WITH_RCCL) #include diff --git a/paddle/phi/core/distributed/check/static_check.cc b/paddle/phi/core/distributed/check/static_check.cc index 8ec3e19e6038ea..25cdc8d01262e5 100644 --- a/paddle/phi/core/distributed/check/static_check.cc +++ b/paddle/phi/core/distributed/check/static_check.cc @@ -17,9 +17,9 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/comm_context.h b/paddle/phi/core/distributed/comm_context.h index 173ff6f8673d48..49c9a9238cf42c 100644 --- a/paddle/phi/core/distributed/comm_context.h +++ b/paddle/phi/core/distributed/comm_context.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/comm_context_manager.h b/paddle/phi/core/distributed/comm_context_manager.h index cc6eff70c71ba9..8c4d802294986f 100644 --- a/paddle/phi/core/distributed/comm_context_manager.h +++ b/paddle/phi/core/distributed/comm_context_manager.h @@ -20,9 +20,9 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/distributed/comm_context.h" -#include "paddle/phi/core/macros.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/gpu/forwards.h" diff --git a/paddle/phi/core/distributed/comm_task.h b/paddle/phi/core/distributed/comm_task.h index 
05560eb67dafce..47ba01b980479a 100644 --- a/paddle/phi/core/distributed/comm_task.h +++ b/paddle/phi/core/distributed/comm_task.h @@ -18,9 +18,9 @@ #include #include #include +#include "paddle/common/macros.h" #include "paddle/phi/core/distributed/utils.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/macros.h" #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" diff --git a/paddle/phi/core/distributed/comm_task_manager.h b/paddle/phi/core/distributed/comm_task_manager.h index bb739d5c6afdb8..cda83f54d5bef2 100644 --- a/paddle/phi/core/distributed/comm_task_manager.h +++ b/paddle/phi/core/distributed/comm_task_manager.h @@ -23,9 +23,9 @@ #include #include +#include "paddle/common/macros.h" #include "paddle/phi/core/distributed/comm_context.h" #include "paddle/phi/core/distributed/comm_task.h" -#include "paddle/phi/core/macros.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/gloo_comm_context.h b/paddle/phi/core/distributed/gloo_comm_context.h index c29935c061ef72..1fdbad11e848b3 100644 --- a/paddle/phi/core/distributed/gloo_comm_context.h +++ b/paddle/phi/core/distributed/gloo_comm_context.h @@ -19,8 +19,8 @@ #include +#include "paddle/common/macros.h" #include "paddle/phi/core/distributed/comm_context.h" -#include "paddle/phi/core/macros.h" namespace phi { class DenseTensor; diff --git a/paddle/phi/core/distributed/gloo_utils.cc b/paddle/phi/core/distributed/gloo_utils.cc index 312681384a1996..55d4689ca3df80 100644 --- a/paddle/phi/core/distributed/gloo_utils.cc +++ b/paddle/phi/core/distributed/gloo_utils.cc @@ -26,10 +26,10 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/phi/core/distributed/gloo_utils.h" #include "paddle/phi/core/distributed/store/tcp_utils.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc index d1d92c98fb0fd6..8da676e74d911a 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.cc +++ b/paddle/phi/core/distributed/nccl_comm_context.cc @@ -147,8 +147,8 @@ void NCCLCommContext::Send(const phi::DenseTensor& in_tensor, peer, nccl_comm_, stream)); - VLOG(3) << "rank " << GetRank() << " send " << phi::product(in_tensor.dims()) - << " to " << peer; + VLOG(3) << "rank " << GetRank() << " send " + << common::product(in_tensor.dims()) << " to " << peer; } void NCCLCommContext::Recv(phi::DenseTensor* out_tensor, @@ -167,7 +167,7 @@ void NCCLCommContext::Recv(phi::DenseTensor* out_tensor, nccl_comm_, stream)); VLOG(3) << "rank " << GetRank() << " recv " - << phi::product(out_tensor->dims()) << " from " << peer; + << common::product(out_tensor->dims()) << " from " << peer; } void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, diff --git a/paddle/phi/core/distributed/nccl_comm_context.h b/paddle/phi/core/distributed/nccl_comm_context.h index b9fdce02f4b5f0..609b5e0defe079 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.h +++ b/paddle/phi/core/distributed/nccl_comm_context.h @@ -22,10 +22,10 @@ #include #endif +#include "paddle/common/macros.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/distributed/comm_context.h" -#include "paddle/phi/core/macros.h" #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" diff --git a/paddle/phi/core/distributed/nccl_comm_task.h 
b/paddle/phi/core/distributed/nccl_comm_task.h index f9a8f3c2509220..fca9004cf0b2d4 100644 --- a/paddle/phi/core/distributed/nccl_comm_task.h +++ b/paddle/phi/core/distributed/nccl_comm_task.h @@ -13,11 +13,11 @@ // limitations under the License. #pragma once +#include "paddle/common/macros.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/distributed/comm_context.h" #include "paddle/phi/core/distributed/comm_task.h" #include "paddle/phi/core/distributed/utils.h" -#include "paddle/phi/core/macros.h" #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" diff --git a/paddle/phi/core/distributed/nccl_tools.cc b/paddle/phi/core/distributed/nccl_tools.cc index e419cfca905fa5..a5388796d1f45b 100644 --- a/paddle/phi/core/distributed/nccl_tools.cc +++ b/paddle/phi/core/distributed/nccl_tools.cc @@ -16,8 +16,8 @@ #include +#include "paddle/common/errors.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #if NCCL_VERSION_CODE >= 21300 #define ENABLE_NCCL_GET_LAST_ERROR diff --git a/paddle/phi/core/distributed/store/CMakeLists.txt b/paddle/phi/core/distributed/store/CMakeLists.txt index 3b62a1367eea9e..c22b793e000f95 100644 --- a/paddle/phi/core/distributed/store/CMakeLists.txt +++ b/paddle/phi/core/distributed/store/CMakeLists.txt @@ -11,5 +11,5 @@ if(NOT WIN32) cc_test( test_c_tcp_store SRCS test_tcp_store.cc - DEPS phi) + DEPS phi common) endif() diff --git a/paddle/phi/core/distributed/xccl_comm_context.cc b/paddle/phi/core/distributed/xccl_comm_context.cc index 5c82e7baf0e82f..ba7e24ab06b9e1 100644 --- a/paddle/phi/core/distributed/xccl_comm_context.cc +++ b/paddle/phi/core/distributed/xccl_comm_context.cc @@ -122,8 +122,8 @@ void XCCLCommContext::Send(const phi::DenseTensor& in_tensor, peer, xccl_comm_, stream); - VLOG(3) << "rank " << GetRank() << " send " << phi::product(in_tensor.dims()) - << " to " << peer; + VLOG(3) << "rank " << GetRank() << " send " + << common::product(in_tensor.dims()) << " to " << peer; } void XCCLCommContext::Recv(phi::DenseTensor* out_tensor, @@ -140,7 +140,7 @@ void XCCLCommContext::Recv(phi::DenseTensor* out_tensor, xccl_comm_, stream); VLOG(3) << "rank " << GetRank() << " recv " - << phi::product(out_tensor->dims()) << " from " << peer; + << common::product(out_tensor->dims()) << " from " << peer; } void XCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, diff --git a/paddle/phi/core/distributed/xccl_comm_context.h b/paddle/phi/core/distributed/xccl_comm_context.h index 86f8dfc76a1eb3..0c253eb925bb4d 100644 --- a/paddle/phi/core/distributed/xccl_comm_context.h +++ b/paddle/phi/core/distributed/xccl_comm_context.h @@ -13,8 +13,8 @@ // limitations under the License. #pragma once +#include "paddle/common/macros.h" #include "paddle/phi/core/distributed/comm_context.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/backends/device_manager.h" diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index aa68dd802c0b4c..61e502951f24ee 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -11,19 +11,7 @@ limitations under the License. 
*/ #pragma once -#ifdef __GNUC__ -#include // for __cxa_demangle -#endif // __GNUC__ - -#if !defined(_WIN32) -#include // dladdr -#include // sleep, usleep -#else // _WIN32 -#ifndef NOMINMAX -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#endif -#include // GetModuleFileName, Sleep -#endif +#include "paddle/common/enforce.h" #ifdef PADDLE_WITH_CUDA #include @@ -51,18 +39,11 @@ limitations under the License. */ #include #include #include -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" #if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) #include #endif -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "paddle/phi/core/errors.h" - -#include "paddle/utils/string/printf.h" -#include "paddle/utils/string/to_string.h" -#include "paddle/utils/test_macros.h" - #ifdef PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" @@ -70,7 +51,6 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cusolver.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include - #include "paddle/phi/backends/dynload/nccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_CUDA @@ -82,7 +62,6 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/rocblas.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #include // NOLINT - #include "paddle/phi/backends/dynload/rccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_HIP @@ -97,56 +76,9 @@ limitations under the License. */ #include "xpu/bkcl.h" #endif -#include "paddle/utils/variant.h" - -namespace phi { -class ErrorSummary; -} // namespace phi - -namespace phi { -namespace proto {} // namespace proto -} // namespace phi - namespace phi { namespace enforce { -/** HELPER MACROS AND FUNCTIONS **/ -#ifndef PADDLE_MAY_THROW -#define PADDLE_MAY_THROW noexcept(false) -#endif - -// Because most enforce conditions would evaluate to true, we can use -// __builtin_expect to instruct the C++ compiler to generate code that -// always forces branch prediction of true. -// This generates faster binary code. __builtin_expect is since C++11. -// For more details, please check https://stackoverflow.com/a/43870188/724872. -#if !defined(_WIN32) -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -#else -// there is no equivalent intrinsics in msvc. -#define UNLIKELY(condition) (condition) -#endif - -#if !defined(_WIN32) -#define LIKELY(condition) __builtin_expect(static_cast(condition), 1) -#else -// there is no equivalent intrinsics in msvc. 
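The LIKELY/UNLIKELY block deleted above is now expected to come from paddle/common/enforce.h; as the deleted comment notes, it is purely a branch-prediction hint built on __builtin_expect, with a plain pass-through on MSVC. Its effect in use, sketched (the cast's template argument was lost in extraction above and is assumed to be bool):

#if !defined(_WIN32)
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#else
#define UNLIKELY(condition) (condition)  // MSVC: no equivalent intrinsic
#endif

int CheckedDiv(int a, int b) {
  if (UNLIKELY(b == 0)) {  // enforce-style checks almost never fire
    return 0;
  }
  return a / b;
}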
-#define LIKELY(condition) (condition) -#endif - -#if defined _WIN32 && defined PADDLE_ON_INFERENCE && defined PADDLE_NO_PYTHON -#define HANDLE_THE_ERROR try { -#define END_HANDLE_THE_ERROR \ - } \ - catch (const std::exception& e) { \ - std::cout << e.what() << std::endl; \ - throw; \ - } -#else -#define HANDLE_THE_ERROR -#define END_HANDLE_THE_ERROR -#endif - #ifdef __GNUC__ inline std::string demangle(std::string name) { int status = -4; // some arbitrary value to eliminate the compiler warning @@ -318,7 +250,7 @@ struct EnforceNotMet : public std::exception { simple_err_str_ = SimplifyErrorTypeFormat(err_str_); } - EnforceNotMet(const phi::ErrorSummary& error, const char* file, int line) + EnforceNotMet(const common::ErrorSummary& error, const char* file, int line) : code_(error.code()), err_str_(GetTraceBackString(error.to_string(), file, line)) { simple_err_str_ = SimplifyErrorTypeFormat(err_str_); @@ -332,7 +264,7 @@ struct EnforceNotMet : public std::exception { } } - phi::ErrorCode code() const { return code_; } + common::ErrorCode code() const { return code_; } const std::string& error_str() const { return err_str_; } @@ -350,7 +282,7 @@ struct EnforceNotMet : public std::exception { private: // Used to determine the final type of exception thrown - phi::ErrorCode code_ = phi::ErrorCode::LEGACY; + common::ErrorCode code_ = common::ErrorCode::LEGACY; // Complete error message // e.g. InvalidArgumentError: *** std::string err_str_; diff --git a/paddle/phi/core/errors.h b/paddle/phi/core/errors.h deleted file mode 100644 index 1dd5cbcaaf6c74..00000000000000 --- a/paddle/phi/core/errors.h +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/utils/string/printf.h" -#include "paddle/utils/test_macros.h" - -namespace phi { -enum ErrorCode { - // Legacy error. - // Error type string: "Error" - LEGACY = 0, - - // Client specified an invalid argument. - // Error type string: "InvalidArgumentError" - INVALID_ARGUMENT = 1, - - // Some requested entity (e.g., file or directory) was not found. - // Error type string: "NotFoundError" - NOT_FOUND = 2, - - // Operation tried to iterate past the valid input range. E.g., seeking or - // reading past end of file. - // Error type string: "OutOfRangeError" - OUT_OF_RANGE = 3, - - // Some entity that we attempted to create (e.g., file or directory) - // already exists. - // Error type string: "AlreadyExistsError" - ALREADY_EXISTS = 4, - - // Some resource has been exhausted, perhaps a per-user quota, or - // perhaps the entire file system is out of space. - // Error type string: "ResourceExhaustedError" - RESOURCE_EXHAUSTED = 5, - - // Operation was rejected because the system is not in a state - // required for the operation's execution. 
- // Error type string: "PreconditionNotMetError" - PRECONDITION_NOT_MET = 6, - - // The caller does not have permission to execute the specified - // operation. - // Error type string: "PermissionDeniedError" - PERMISSION_DENIED = 7, - - // Deadline expired before operation could complete. - // Error type string: "ExecutionTimeout" - EXECUTION_TIMEOUT = 8, - - // Operation is not implemented or not supported/enabled in this service. - // Error type string: "UnimplementedError" - UNIMPLEMENTED = 9, - - // The service is currently unavailable. This is a most likely a - // transient condition and may be corrected by retrying with - // a backoff. - // Error type string: "UnavailableError" - UNAVAILABLE = 10, - - // Fatal errors. Means some invariant expected by the underlying - // system has been broken. If you see one of these errors, - // something is very broken. - // Error type string: "FatalError" - FATAL = 11, - - // Third-party library error. - // Error type string: "ExternalError" - EXTERNAL = 12, -}; - -class ErrorSummary { - public: - // Note(chenweihang): Final deprecated constructor - // This constructor is used to be compatible with - // current existing untyped PADDLE_ENFORCE_* - // PADDLE_ENFORCE - // Note(chenweihang): Windows openblas need this - // constructor for compiling PADDLE_ENFORCE in *.cu, - // this is a bug cause we can't remove this - // constructor now. - template - explicit ErrorSummary(Args... args) { - code_ = phi::ErrorCode::LEGACY; - msg_ = paddle::string::Sprintf(args...); - } - - // Note(chenweihang): Only recommended constructor - // No longer supports PADDLE_ENFORCE without type or without error message - explicit ErrorSummary(ErrorCode code, std::string msg) - : code_(code), msg_(msg) {} - - ErrorCode code() const { return code_; } - - const std::string& error_message() const { return msg_; } - - TEST_API std::string to_string() const; - - private: - ErrorCode code_; - std::string msg_; -}; - -namespace errors { - -#define REGISTER_ERROR(FUNC, CONST, ...) \ - template \ - ::phi::ErrorSummary FUNC(Args... args) { \ - return ::phi::ErrorSummary(::phi::CONST, \ - ::paddle::string::Sprintf(args...)); \ - } - -REGISTER_ERROR(InvalidArgument, ErrorCode::INVALID_ARGUMENT) -REGISTER_ERROR(NotFound, ErrorCode::NOT_FOUND) -REGISTER_ERROR(OutOfRange, ErrorCode::OUT_OF_RANGE) -REGISTER_ERROR(AlreadyExists, ErrorCode::ALREADY_EXISTS) -REGISTER_ERROR(ResourceExhausted, ErrorCode::RESOURCE_EXHAUSTED) -REGISTER_ERROR(PreconditionNotMet, ErrorCode::PRECONDITION_NOT_MET) -REGISTER_ERROR(PermissionDenied, ErrorCode::PERMISSION_DENIED) -REGISTER_ERROR(ExecutionTimeout, ErrorCode::EXECUTION_TIMEOUT) -REGISTER_ERROR(Unimplemented, ErrorCode::UNIMPLEMENTED) -REGISTER_ERROR(Unavailable, ErrorCode::UNAVAILABLE) -REGISTER_ERROR(Fatal, ErrorCode::FATAL) -REGISTER_ERROR(External, ErrorCode::EXTERNAL) - -#undef REGISTER_ERROR - -} // namespace errors -} // namespace phi diff --git a/paddle/phi/core/extended_tensor.cc b/paddle/phi/core/extended_tensor.cc index 31d0fb25c88c1d..03609316b840cd 100644 --- a/paddle/phi/core/extended_tensor.cc +++ b/paddle/phi/core/extended_tensor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
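Each REGISTER_ERROR line in the deleted errors.h stamps out a variadic factory function (the "template <typename... Args>" heads were lost in extraction above). One expansion, approximately, with a typical call site; this mirrors the macro body rather than being a standalone program:

// Approximate expansion of
// REGISTER_ERROR(InvalidArgument, ErrorCode::INVALID_ARGUMENT):
template <typename... Args>
::phi::ErrorSummary InvalidArgument(Args... args) {
  return ::phi::ErrorSummary(::phi::ErrorCode::INVALID_ARGUMENT,
                             ::paddle::string::Sprintf(args...));
}

// Typical call site inside a kernel or InferMeta function:
//   PADDLE_ENFORCE_EQ(
//       rank, 2,
//       phi::errors::InvalidArgument("expected a 2-D tensor, got rank %d.",
//                                    rank));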
*/ #include "paddle/phi/core/extended_tensor.h" +#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/core/flags.h b/paddle/phi/core/flags.h index 7c905ec6c64914..9a5ff40596e7d8 100644 --- a/paddle/phi/core/flags.h +++ b/paddle/phi/core/flags.h @@ -20,7 +20,7 @@ #include #include -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" #include "paddle/utils/flags.h" #include "paddle/utils/test_macros.h" #include "paddle/utils/variant.h" diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index bc6ef528d3ba93..06036b2c138940 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -19,11 +19,11 @@ limitations under the License. */ #include #include +#include "paddle/common/macros.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/attribute.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/type_defs.h" #include "paddle/utils/any.h" diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 69c7900def16ba..a5c5a3994a81b1 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -538,7 +538,7 @@ std::string KernelSelectionErrorMessage(const std::string& kernel_name, backend_set.insert( paddle::experimental::BackendToString(kernel_key.backend())); all_kernel_key[paddle::experimental::BackendToString(kernel_key.backend()) + - ", " + phi::DataLayoutToString(kernel_key.layout())] + ", " + common::DataLayoutToString(kernel_key.layout())] .push_back(DataTypeToString(kernel_key.dtype())); } // 1. If target_key not supports target backend, output "Selected wrong diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index 9e3c67fa9ad35b..c30d883ee6c462 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -18,9 +18,9 @@ #include #include #include +#include "paddle/common/layout.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/get_kerneltype_forvar_utils.h" #include "paddle/phi/core/type_defs.h" diff --git a/paddle/phi/core/macros.h b/paddle/phi/core/macros.h deleted file mode 100644 index 6245d94c97cb10..00000000000000 --- a/paddle/phi/core/macros.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -namespace phi { - -// Disable the copy and assignment operator for a class. 
- -#define DISABLE_COPY_AND_ASSIGN(classname) \ - private: \ - classname(const classname&) = delete; \ - classname(classname&&) = delete; \ - classname& operator=(const classname&) = delete; \ - classname& operator=(classname&&) = delete - -#define PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ - _PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) - -#define _PD_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ - struct __test_global_namespace_##uniq_name##__ {}; \ - static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ - __test_global_namespace_##uniq_name##__>::value, \ - msg) - -#ifdef __COUNTER__ -#define PD_ID __COUNTER__ -#else -#define PD_ID __LINE__ -#endif - -#if defined(_WIN32) -#define UNUSED -#define __builtin_expect(EXP, C) (EXP) -#else -#define UNUSED __attribute__((unused)) -#endif - -#define PD_CONCATENATE(arg1, arg2) PD_CONCATENATE1(arg1, arg2) -#define PD_CONCATENATE1(arg1, arg2) PD_CONCATENATE2(arg1, arg2) -#define PD_CONCATENATE2(arg1, arg2) arg1##arg2 -#define PD_EXPAND(x) x - -#if defined(__NVCC__) || defined(__HIPCC__) -#define PADDLE_RESTRICT __restrict__ -#else -#define PADDLE_RESTRICT -#endif - -#ifndef PADDLE_WITH_MUSL -#if defined(__FLT_MAX__) -#define FLT_MAX __FLT_MAX__ -#endif // __FLT_MAX__ -#endif // PADDLE_WITH_MUSL - -} // namespace phi diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 6990f95a4b68ac..b28081c8d4ef77 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -14,10 +14,10 @@ limitations under the License. */ #pragma once +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" +#include "paddle/common/macros.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/phi/core/mixed_vector.h b/paddle/phi/core/mixed_vector.h index d25a646608d3d2..251aa28232b8e8 100644 --- a/paddle/phi/core/mixed_vector.h +++ b/paddle/phi/core/mixed_vector.h @@ -22,10 +22,10 @@ limitations under the License. */ #include #include "glog/logging.h" +#include "paddle/common/errors.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/utils/none.h" #include "paddle/utils/optional.h" diff --git a/paddle/phi/core/scope_guard.h b/paddle/phi/core/scope_guard.h index 1c73133761dd67..5ad0cdb7e3a280 100644 --- a/paddle/phi/core/scope_guard.h +++ b/paddle/phi/core/scope_guard.h @@ -17,7 +17,7 @@ #include #include -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" namespace phi { diff --git a/paddle/phi/core/selected_rows_impl.h b/paddle/phi/core/selected_rows_impl.h index a29f66b99420ab..3647583e5038fd 100644 --- a/paddle/phi/core/selected_rows_impl.h +++ b/paddle/phi/core/selected_rows_impl.h @@ -21,8 +21,8 @@ limitations under the License. 
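A brief usage sketch for DISABLE_COPY_AND_ASSIGN from the relocated macros header; the class name is hypothetical. The macro deletes all four copy/move members, so the type can only be reached through its accessor:

#include "paddle/common/macros.h"  // new home of DISABLE_COPY_AND_ASSIGN

class KernelRegistry {  // hypothetical singleton
 public:
  static KernelRegistry& Instance() {
    static KernelRegistry instance;
    return instance;
  }

 private:
  KernelRegistry() = default;
  DISABLE_COPY_AND_ASSIGN(KernelRegistry);
};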
*/ #include #include +#include "paddle/common/ddim.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/utils/rw_lock.h" @@ -142,9 +142,9 @@ class SelectedRowsImpl { * @brief Get complete Dims before */ phi::DDim GetCompleteDims() const { - std::vector dims = vectorize(value_->dims()); + std::vector dims = common::vectorize(value_->dims()); dims[0] = height_; - return phi::make_ddim(dims); + return common::make_ddim(dims); } /// \brief Returns the number of elements contained in tensor. diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc index f8517fb0ff007e..95501527f51fd0 100644 --- a/paddle/phi/core/sparse_coo_tensor.cc +++ b/paddle/phi/core/sparse_coo_tensor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/enforce.h" namespace phi { @@ -112,7 +113,7 @@ void SparseCooTensor::Resize(const DDim& dense_dims, phi::errors::InvalidArgument( "the sparse_dim must be less than or equal dense_dims.")); - DDim indices_dims = phi::make_ddim({sparse_dim, non_zero_num}); + DDim indices_dims = common::make_ddim({sparse_dim, non_zero_num}); auto dense_dim = dense_dims.size() - sparse_dim; DDim values_dims; if (dense_dim) { @@ -121,9 +122,9 @@ void SparseCooTensor::Resize(const DDim& dense_dims, memcpy(&dense_dim_vec[1], dense_dims.Get() + sparse_dim, dense_dim * sizeof(dense_dims[0])); - values_dims = phi::make_ddim(dense_dim_vec); + values_dims = common::make_ddim(dense_dim_vec); } else { - values_dims = phi::make_ddim({non_zero_num}); + values_dims = common::make_ddim({non_zero_num}); } this->non_zero_indices_.Resize(indices_dims); diff --git a/paddle/phi/core/sparse_csr_tensor.cc b/paddle/phi/core/sparse_csr_tensor.cc index 0dc0807a36baf1..525f38cd8263db 100644 --- a/paddle/phi/core/sparse_csr_tensor.cc +++ b/paddle/phi/core/sparse_csr_tensor.cc @@ -111,10 +111,10 @@ void SparseCsrTensor::Resize(const DDim& dense_dims, crows_size = dense_dims[0] * (dense_dims[1] + 1); } - DDim crows_dims = phi::make_ddim({crows_size}); + DDim crows_dims = common::make_ddim({crows_size}); this->non_zero_crows_.Resize(crows_dims); - DDim col_dims = phi::make_ddim({non_zero_num}); + DDim col_dims = common::make_ddim({non_zero_num}); this->non_zero_cols_.Resize(col_dims); this->non_zero_elements_.Resize(col_dims); } diff --git a/paddle/phi/core/storage_properties.h b/paddle/phi/core/storage_properties.h index ac64875452bf8f..421f853872cb64 100644 --- a/paddle/phi/core/storage_properties.h +++ b/paddle/phi/core/storage_properties.h @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/utils/type_registry.h" #ifdef PADDLE_WITH_DNNL diff --git a/paddle/phi/core/tensor_array.cc b/paddle/phi/core/tensor_array.cc index 1b5a1189483411..a1bcb23c4704b9 100644 --- a/paddle/phi/core/tensor_array.cc +++ b/paddle/phi/core/tensor_array.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
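The GetCompleteDims() hunk above shows the post-move idiom: lower a DDim to std::vector<int64_t> with common::vectorize, edit it, and rebuild with common::make_ddim. As a standalone sketch (function name illustrative; the phi::DDim spelling follows the diff itself):

#include <cstdint>
#include <vector>

#include "paddle/common/ddim.h"

phi::DDim WithLeadingDim(const phi::DDim& dims, int64_t height) {
  std::vector<int64_t> vec = common::vectorize(dims);
  vec[0] = height;                // e.g. restore the full row count
  return common::make_ddim(vec);  // {7, 128} with height 100 -> {100, 128}
}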
*/ #include "paddle/phi/core/tensor_array.h" +#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/core/tensor_base.h b/paddle/phi/core/tensor_base.h index 069382720e19de..99318f86cf42b6 100644 --- a/paddle/phi/core/tensor_base.h +++ b/paddle/phi/core/tensor_base.h @@ -14,12 +14,12 @@ limitations under the License. */ #pragma once +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/utils/type_registry.h" namespace phi { diff --git a/paddle/phi/core/tensor_meta.cc b/paddle/phi/core/tensor_meta.cc index aa0fa712ad5af8..d44ab61a59cd98 100644 --- a/paddle/phi/core/tensor_meta.cc +++ b/paddle/phi/core/tensor_meta.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/tensor_meta.h" -#include "paddle/pir/core/enforce.h" +#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index 176ef60cda7cda..77f2e2bebb4ec3 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -16,10 +16,10 @@ limitations under the License. */ #include +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" #include "paddle/utils/any.h" #include "paddle/utils/optional.h" #include "paddle/utils/test_macros.h" @@ -42,6 +42,7 @@ namespace phi { * 0 2 5 7 10 12 15 20 */ using LoD = std::vector>; +using DDim = phi::DDim; /// \brief The meta data of dense tensor. Take the structure type /// and use all default operations. diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index 26e675326593cb..17fdef1b9cfbdd 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -915,7 +915,7 @@ phi::DenseTensor ReshapeToMatrix(const phi::DenseTensor& src, } phi::DenseTensor res; res.ShareDataWith(src); - res.Resize(phi::flatten_to_2d(src.dims(), num_col_dims)); + res.Resize(common::flatten_to_2d(src.dims(), num_col_dims)); return res; } diff --git a/paddle/phi/core/threadpool.h b/paddle/phi/core/threadpool.h index b45991f9a7f825..318ec38d3c8c58 100644 --- a/paddle/phi/core/threadpool.h +++ b/paddle/phi/core/threadpool.h @@ -24,8 +24,8 @@ #include #include +#include "paddle/common/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/macros.h" // for DISABLE_COPY_AND_ASSIGN namespace phi { diff --git a/paddle/phi/core/utils/array.h b/paddle/phi/core/utils/array.h deleted file mode 100644 index 44290b73737fb7..00000000000000 --- a/paddle/phi/core/utils/array.h +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/utils/unroll_array_ops.h" - -namespace phi { - -template -class Array { - public: - static constexpr size_t kSize = N; - - HOSTDEVICE inline Array() {} - - template - HOSTDEVICE inline explicit Array(const T &val, Args... args) { - static_assert(N == sizeof...(Args) + 1, "Invalid argument"); - UnrollVarArgsAssign::Run(data_, val, args...); - } - - HOSTDEVICE inline void Fill(const T &val) { - UnrollFillConstant::Run(data_, val); - } - - HOSTDEVICE inline const T *Get() const { return data_; } - - HOSTDEVICE inline T *GetMutable() { return data_; } - - HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); } - - // Writing "return data_[i]" would cause compilation warning/error: - // "array subscript is above array bound" in Python 35 CI. - // It seems that it is a false warning of GCC if we do not check the bounds - // of array index. But for better performance, we do not check in operator[] - // like what is in STL. If users want to check the bounds, use at() instead - HOSTDEVICE inline const T &operator[](size_t i) const { - return *advance(data_, i); - } - - HOSTDEVICE inline T &at(size_t i) { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) - PADDLE_ENFORCE_LT( - i, N, phi::errors::OutOfRange("Array index out of bounds.")); -#endif - return (*this)[i]; - } - - HOSTDEVICE inline const T &at(size_t i) const { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) - PADDLE_ENFORCE_LT( - i, N, phi::errors::OutOfRange("Array index out of bounds.")); -#endif - return (*this)[i]; - } - - HOSTDEVICE constexpr size_t size() const { return N; } - - HOSTDEVICE inline bool operator==(const Array &other) const { - return UnrollCompare::Run(data_, other.data_); - } - - HOSTDEVICE inline bool operator!=(const Array &other) const { - return !(*this == other); - } - - private: - template - HOSTDEVICE static inline U *advance(U *ptr, size_t i) { - return ptr + i; - } - - T data_[N] = {}; -}; - -template -class Array { - public: - static constexpr size_t kSize = 0; - - HOSTDEVICE inline Array() {} - - HOSTDEVICE inline void Fill(const T &val) {} - - HOSTDEVICE inline constexpr T *Get() const { return nullptr; } - - // Add constexpr to GetMutable() cause warning in MAC - HOSTDEVICE inline T *GetMutable() { return nullptr; } - - HOSTDEVICE inline T &operator[](size_t) { -#if defined(__HIPCC__) || defined(__CUDA_ARCH__) - // HIP and CUDA will have compile error, if use "obj()" - // function declared in block scope cannot have 'static' storage class - static T obj{}; - return obj; -#else - PADDLE_THROW(phi::errors::Unavailable("Array has no element.")); -#endif - } - - HOSTDEVICE inline const T &operator[](size_t) const { -#if defined(__HIPCC__) || defined(__CUDA_ARCH__) - // HIP and CUDA will have compile error, if use "obj()" - // function declared in block scope cannot have 'static' storage class - static const T obj{}; - return obj; -#else - PADDLE_THROW(phi::errors::Unavailable("Array has no element.")); -#endif - } - - HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; } - 
- HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; } - - HOSTDEVICE constexpr size_t size() const { return 0; } - - HOSTDEVICE constexpr bool operator==(const Array &other) const { - return true; - } - - HOSTDEVICE constexpr bool operator!=(const Array &other) const { - return false; - } -}; - -} // namespace phi diff --git a/paddle/phi/core/utils/dim.h b/paddle/phi/core/utils/dim.h deleted file mode 100644 index 7cc023f8cc7d16..00000000000000 --- a/paddle/phi/core/utils/dim.h +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/core/utils/array.h" - -namespace phi { - -// Statically sized, statically indexed dimension -template -class Dim : public Array { - public: - static_assert(D >= 0, "D must be not less than 0"); - - static constexpr int kRank = D; - using BaseClass = Array; - - inline Dim(int64_t head, const Dim& tail) { - (*this)[0] = head; - new (this->GetMutable() + 1) Dim(tail); - } - - template - HOSTDEVICE explicit Dim(int64_t head, Args... args) - : BaseClass(head, args...) {} - - /** Construct a Dim with each dimension set to the given index */ - HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); } - - HOSTDEVICE Dim() = default; - - HOST std::string to_string() const; -}; - -// Product of a Dim -template -HOSTDEVICE inline int64_t product(const Dim& a) { - return UnrollProduct::Run(a.Get()); -} - -/** - * Helper function to create a Dim - * - * \param idxes The type of Dim constructed depends on the number of params - * - */ - -template -HOSTDEVICE inline Dim make_dim(Args... idxes) { - return Dim(idxes...); -} - -// Allows us to output a Dim -template -inline std::ostream& operator<<(std::ostream& os, const Dim& d) { - if (D > 0) { - os << d[0]; - for (int i = 1; i < D; ++i) { - os << ", " << d[i]; - } - } else { - os << ""; - } - - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d UNUSED) { - return os; -} - -template -HOST std::string Dim::to_string() const { - std::stringstream stream; - stream << *this; - return stream.str(); -} - -template -inline void static_dim_assign(const T1* in, T2* out) { - UnrollAssign::Run(in, out); -} - -} // namespace phi - -namespace paddle { -namespace framework { -template -using Dim = phi::Dim; - -} // namespace framework -} // namespace paddle diff --git a/paddle/phi/core/utils/unroll_array_ops.h b/paddle/phi/core/utils/unroll_array_ops.h deleted file mode 100644 index ea9d6273e4a4ef..00000000000000 --- a/paddle/phi/core/utils/unroll_array_ops.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
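Putting the two deleted headers together, a usage sketch of the pre-move phi::Dim surface, reconstructed from the declarations above (whose template heads were stripped in extraction); it assumes the pre-move headers are still on the include path:

#include "paddle/phi/core/utils/dim.h"

phi::Dim<3> d = phi::make_dim(2, 3, 4);     // statically sized {2, 3, 4}
int64_t numel = phi::product(d);            // 24, computed via UnrollProduct
bool same = (d == phi::make_dim(2, 3, 4));  // element-wise UnrollCompare
// operator<< prints "2, 3, 4"; a Dim<0> prints nothing at all.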
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/core/macros.h" - -namespace phi { -namespace detail { -template -struct UnrollFillConstant { - template - HOSTDEVICE inline static void Run(T *data, T val) { - data[kStart] = val; - UnrollFillConstant::Run(data, val); - } -}; - -template -struct UnrollFillConstant { - template - HOSTDEVICE inline static void Run(T *data UNUSED, T val UNUSED) {} -}; - -template -struct UnrollAssign { - template - HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) { - d2[kStart] = static_cast(d1[kStart]); - UnrollAssign::Run(d1, d2); - } -}; - -template -struct UnrollAssign { - template - HOSTDEVICE inline static void Run(const Tin *d1 UNUSED, Tout *d2 UNUSED) {} -}; - -template -struct UnrollVarArgsAssignImpl { - template - HOSTDEVICE inline static void Run(T *d, T val, Args... args) { - static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument"); - d[kStart] = val; - UnrollVarArgsAssignImpl::Run( - d, args...); - } -}; - -template -struct UnrollVarArgsAssignImpl { - HOSTDEVICE inline static void Run(T *d) {} -}; - -template -struct UnrollVarArgsAssign { - template - HOSTDEVICE inline static void Run(T *d, Args... args) { - UnrollVarArgsAssignImpl::Run( - d, args...); - } -}; - -template -struct UnrollCompare { - template - HOSTDEVICE inline static bool Run(const T *d1, const T *d2) { - return d1[kStart] == d2[kStart] && - UnrollCompare::Run(d1, d2); - } -}; - -template -struct UnrollCompare { - template - HOSTDEVICE inline constexpr static bool Run(const T *d1 UNUSED, - const T *d2 UNUSED) { - return true; - } -}; - -template -struct UnrollProduct { - template - HOSTDEVICE inline static T Run(const T *d) { - return d[kStart] * - UnrollProduct::Run(d); - } -}; - -template -struct UnrollProduct { - template - HOSTDEVICE inline constexpr static T Run(const T *d UNUSED) { - return 1; - } -}; -} // namespace detail - -template -using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>; - -template -using UnrollAssign = detail::UnrollAssign<0, N, N == 0>; - -template -using UnrollVarArgsAssign = detail::UnrollVarArgsAssign; - -template -using UnrollCompare = detail::UnrollCompare<0, N, N == 0>; - -template -using UnrollProduct = detail::UnrollProduct<0, N, N == 0>; - -} // namespace phi diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index 047ba79bc15998..28f575295b47e1 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -14,7 +14,7 @@ limitations under the License. 
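All of the Unroll* helpers above share one scheme: a trailing bool template parameter flips to true at kEnd and selects a base-case specialization, so the "loop" is fully expanded at compile time. A self-contained equivalent of UnrollProduct (template heads reconstructed, since the originals were stripped in extraction):

#include <cstddef>
#include <cstdint>

namespace detail {
template <std::size_t kStart, std::size_t kEnd, bool kStop>
struct UnrollProduct {
  template <typename T>
  static T Run(const T* d) {
    // Multiply d[kStart] and recurse; kStop flips to true at kEnd.
    return d[kStart] *
           UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
  }
};

template <std::size_t kStart, std::size_t kEnd>
struct UnrollProduct<kStart, kEnd, true> {
  template <typename T>
  static T Run(const T*) { return 1; }  // base case: empty product
};
}  // namespace detail

template <std::size_t N>
using UnrollProduct = detail::UnrollProduct<0, N, N == 0>;

// UnrollProduct<3>::Run(dims) expands to dims[0] * dims[1] * dims[2] * 1.
int64_t Product3(const int64_t* dims) { return UnrollProduct<3>::Run(dims); }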
*/ #pragma once -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/common/data_type.h" namespace phi { diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index a3eb7ce8c906b3..606ca86fad99ed 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -26,9 +26,9 @@ void AffineGridGradInferMeta(const MetaTensor& output_grad, if (input_grad) { auto output_dims = output_grad.dims(); if (output_dims.size() == 4) { - input_grad->set_dims(phi::make_ddim({output_dims[0], 2, 3})); + input_grad->set_dims(common::make_ddim({output_dims[0], 2, 3})); } else { - input_grad->set_dims(phi::make_ddim({output_dims[0], 3, 4})); + input_grad->set_dims(common::make_ddim({output_dims[0], 3, 4})); } } } @@ -588,7 +588,7 @@ void KernelWithXShapeInferMeta(const MetaTensor& xshape, const MetaTensor& out, MetaTensor* dx) { auto xshape_dims = xshape.dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); dx->set_dims(x_dims); dx->set_dtype(out.dtype()); dx->share_lod(xshape); @@ -716,17 +716,17 @@ void MemoryEfficientAttentionGradInferMeta(const MetaTensor& query, std::vector value_grad_dims( {value_batch_size, value_seq_length, value_num_head, value_head_size}); - query_grad->set_dims(phi::make_ddim(query_grad_dims)); + query_grad->set_dims(common::make_ddim(query_grad_dims)); query_grad->share_lod(query); query_grad->set_dtype(query.dtype()); query_grad->set_layout(query.layout()); - key_grad->set_dims(phi::make_ddim(key_grad_dims)); + key_grad->set_dims(common::make_ddim(key_grad_dims)); key_grad->share_lod(key); key_grad->set_dtype(key.dtype()); key_grad->set_layout(key.layout()); - value_grad->set_dims(phi::make_ddim(value_grad_dims)); + value_grad->set_dims(common::make_ddim(value_grad_dims)); value_grad->share_lod(value); value_grad->set_dtype(value.dtype()); value_grad->set_layout(value.layout()); @@ -740,7 +740,7 @@ void MemoryEfficientAttentionGradInferMeta(const MetaTensor& query, std::vector bias_grad_dims( {bias_batch_size, bias_seq_length, bias_num_head, bias_head_size}); - bias_grad->set_dims(phi::make_ddim(bias_grad_dims)); + bias_grad->set_dims(common::make_ddim(bias_grad_dims)); bias_grad->share_lod(bias); bias_grad->set_dtype(bias.dtype()); bias_grad->set_layout(bias.layout()); @@ -818,8 +818,8 @@ void NllLossGradInferMeta(const MetaTensor& x, const auto& x_dims = x.dims(); const auto& label_dims = label.dims(); const auto& dout_dims = out_grad.dims(); - bool contain_unknown_dim = - phi::contain_unknown_dim(x_dims) || phi::contain_unknown_dim(dout_dims); + bool contain_unknown_dim = common::contain_unknown_dim(x_dims) || + common::contain_unknown_dim(dout_dims); bool check = config.is_runtime || !contain_unknown_dim; if (check) { @@ -1061,12 +1061,12 @@ void StackGradInferMeta(const MetaTensor& out_grad, x_grad.size(), static_cast(dy_dim[axis]))); - auto vec = phi::vectorize(dy_dim); + auto vec = common::vectorize(dy_dim); vec.erase(vec.begin() + axis); for (auto& grad : x_grad) { if (grad) { - grad->set_dims(phi::make_ddim(vec)); + grad->set_dims(common::make_ddim(vec)); grad->set_dtype(out_grad.dtype()); } } @@ -1153,9 +1153,9 @@ void UnStackGradInferMeta(const std::vector& out_grad, rank)); if (axis < 0) axis += (rank + 1); - auto vec = phi::vectorize(input_dims[0]); + auto vec = common::vectorize(input_dims[0]); vec.insert(vec.begin() + axis, static_cast(input_dims.size())); - 
x_grad->set_dims(phi::make_ddim(vec)); + x_grad->set_dims(common::make_ddim(vec)); x_grad->set_dtype(out_grad[0]->dtype()); } diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 53b3b00286a583..b41ec59d259741 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -18,11 +18,11 @@ limitations under the License. */ #include #include "glog/logging.h" +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" #include "paddle/phi/api/lib/data_type_set.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/type_traits.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/infermeta/unary.h" @@ -76,12 +76,12 @@ static void BinarySameInputDimsCheck(const MetaTensor& x, // Used in MatrixRankTolInferMeta static DDim CheckAndGetOutputDim(const DDim& dim_x) { - auto x_vec = phi::vectorize(dim_x); + auto x_vec = common::vectorize(dim_x); if (x_vec.size() == 2) { - return phi::make_ddim({}); + return common::make_ddim({}); } x_vec.erase(x_vec.end() - 2, x_vec.end()); - return phi::make_ddim(x_vec); + return common::make_ddim(x_vec); } } // namespace detail @@ -91,7 +91,7 @@ void AllValueCompareInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config) { detail::BinarySameInputDimsCheck(x, y, config); - out->set_dims(phi::make_ddim({})); + out->set_dims(common::make_ddim({})); out->set_dtype(DataType::BOOL); } @@ -135,7 +135,7 @@ void KLDivInferMeta(const MetaTensor& x, if ("none" == reduction) { out->set_dims(dim_x); } else { - out->set_dims(phi::make_ddim({})); + out->set_dims(common::make_ddim({})); } out->set_dtype(x.dtype()); } @@ -212,7 +212,7 @@ void BCELossInferMeta(const MetaTensor& input, bool check = true; if ((!config.is_runtime) && - (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) { + (common::product(input_dims) <= 0 || common::product(label_dims) <= 0)) { check = false; } @@ -267,7 +267,7 @@ void BincountInferMeta(const MetaTensor& x, weights_dim, input_dim)); } - out->set_dims(phi::make_ddim({-1})); + out->set_dims(common::make_ddim({-1})); if (weights) { out->set_dtype(weights.dtype()); } else { @@ -278,8 +278,8 @@ void BincountInferMeta(const MetaTensor& x, } void BmmInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { - std::vector x_dims = phi::vectorize(x.dims()); - std::vector y_dims = phi::vectorize(y.dims()); + std::vector x_dims = common::vectorize(x.dims()); + std::vector y_dims = common::vectorize(y.dims()); std::size_t x_ndims = x_dims.size(); std::size_t y_ndims = y_dims.size(); @@ -318,7 +318,7 @@ void BmmInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { "Y's batch size [%s]")); dim_out.push_back(x_dims[1]); dim_out.push_back(y_dims[2]); - out->set_dims(phi::make_ddim(dim_out)); + out->set_dims(common::make_ddim(dim_out)); out->share_lod(x); out->set_dtype(x.dtype()); out->set_layout(x.layout()); @@ -358,8 +358,8 @@ void CholeskySolveInferMeta(const MetaTensor& x, x_dims[x_dims_n - 2], y_dims[y_dims_n - 2])); - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); + std::vector x_dims_vec = common::vectorize(x_dims); + std::vector y_dims_vec = common::vectorize(y_dims); std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); @@ -372,7 +372,7 @@ void 
CholeskySolveInferMeta(const MetaTensor& x, {x_dims_vec[x_dims_n - 2], x_dims_vec[x_dims_n - 1]}); // dim of 'out' is the same with 'X' after broadcast - out->set_dims(phi::make_ddim(x_broadcast_dims)); + out->set_dims(common::make_ddim(x_broadcast_dims)); out->set_dtype(x.dtype()); out->set_layout(x.layout()); out->share_lod(x); @@ -401,7 +401,7 @@ void CompareRawInferMeta(const MetaTensor& x, max_dim, axis); - out->set_dims(make_ddim(out_dims_array)); + out->set_dims(common::make_ddim(out_dims_array)); out->share_lod(x); } if (!out->is_same_tensor(x)) { @@ -426,15 +426,15 @@ void CompareAllInferMeta(const MetaTensor& x, errors::InvalidArgument( "The size of dim_y should not be greater than dim_x's.")); out->share_lod(x); - out->set_dims(make_ddim({})); + out->set_dims(common::make_ddim({})); } void ComplexInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { if (x.dims() == y.dims()) { - auto sizes = vectorize(x.dims()); - out->set_dims(phi::make_ddim(sizes)); + auto sizes = common::vectorize(x.dims()); + out->set_dims(common::make_ddim(sizes)); out->set_dtype(dtype::ToComplex(x.dtype())); // NOTE(chenfeiyu): lod & broadcasting is intrinsically contradictory // so tensors with lod are not supported here @@ -455,7 +455,7 @@ void ComplexInferMeta(const MetaTensor& x, out_dims_array.data(), max_dim, axis); - out->set_dims(phi::make_ddim(out_dims_array)); + out->set_dims(common::make_ddim(out_dims_array)); out->set_dtype(dtype::ToComplex(x.dtype())); } } @@ -540,7 +540,7 @@ void ConvInferMeta(const MetaTensor& input, in_dims.size(), in_dims, strides.size(), - phi::make_ddim(strides), + common::make_ddim(strides), in_sub_stride_size)); const auto input_channels = @@ -583,14 +583,15 @@ void ConvInferMeta(const MetaTensor& input, DDim in_data_dims; if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + in_data_dims = common::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + in_data_dims = common::slice_ddim(in_dims, 2, in_dims.size()); } - DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); + DDim filter_data_dims = + common::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); phi::UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); @@ -616,7 +617,7 @@ void ConvInferMeta(const MetaTensor& input, output_shape.push_back(filter_dims[0]); } - out->set_dims(make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->set_dtype(input.dtype()); } @@ -662,7 +663,7 @@ void ConvTransposeInferMeta(const MetaTensor& x, const DataLayout data_layout = config.is_run_mkldnn_kernel ? 
DataLayout::kNCHW - : phi::StringToDataLayout(data_format); + : common::StringToDataLayout(data_format); PADDLE_ENFORCE_EQ( x_dims.size() == 4 || x_dims.size() == 5, @@ -750,7 +751,7 @@ void ConvTransposeInferMeta(const MetaTensor& x, x_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); } DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); @@ -775,7 +776,7 @@ void ConvTransposeInferMeta(const MetaTensor& x, "output_size of Op(ConvTransposeOp) should not be " "less than the infered output size. But received output_size = " "[%s], whose dim %d is less than the infered output size [%s]", - make_ddim(output_size).to_str(), + common::make_ddim(output_size).to_str(), i, infer_shape)); PADDLE_ENFORCE_LT( @@ -786,7 +787,7 @@ void ConvTransposeInferMeta(const MetaTensor& x, "than infered size + stride. But received output_size = [%s], " "whose dim %d is not less than the infered output size (%d) + " "stride (%d) = %d", - make_ddim(output_size).to_str(), + common::make_ddim(output_size).to_str(), i, infer_shape, strides[i], @@ -802,7 +803,7 @@ void ConvTransposeInferMeta(const MetaTensor& x, "output_padding of Op(ConvTransposeOp) should not be " "less than the 0. But received output_padding = " "[%s], whose dim %d is less than 0", - make_ddim(output_padding).to_str(), + common::make_ddim(output_padding).to_str(), i)); PADDLE_ENFORCE_LT( output_padding[i], @@ -813,7 +814,7 @@ void ConvTransposeInferMeta(const MetaTensor& x, "[%s], " "whose dim %d is not less than either stride (%d) or " "dilation (%d)", - make_ddim(output_size).to_str(), + common::make_ddim(output_size).to_str(), i, strides[i], dilations_[i])); @@ -827,7 +828,7 @@ void ConvTransposeInferMeta(const MetaTensor& x, output_shape.push_back(filter_dims[1] * groups); } - out->set_dims(make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->set_dtype(x.dtype()); } @@ -1009,19 +1010,19 @@ void DistInferMeta(const MetaTensor& x, auto x_dims = x.dims(); auto y_dims = y.dims(); - PADDLE_ENFORCE_NE(phi::product(x_dims), + PADDLE_ENFORCE_NE(common::product(x_dims), 0, phi::errors::InvalidArgument( "The Input(X) has not been initialized properly. The " "shape of Input(X) = [%s].", x_dims)); - PADDLE_ENFORCE_NE(phi::product(y_dims), + PADDLE_ENFORCE_NE(common::product(y_dims), 0, phi::errors::InvalidArgument( "The Input(Y) has not been initialized properly. The " "shape of Input(Y) = [%s].", y_dims)); - out->set_dims(phi::make_ddim({})); + out->set_dims(common::make_ddim({})); out->set_dtype(x.dtype()); } @@ -1127,7 +1128,7 @@ void DropoutNdInferMeta(const MetaTensor& x, "equal to 0 and less than the dimensions of x. 
But " "received axis is {%s}, the dimension size of x is %d.", i, - phi::make_ddim(axis), + common::make_ddim(axis), x_dims.size())); } @@ -1143,7 +1144,7 @@ void DropoutNdInferMeta(const MetaTensor& x, mask_dims[t] = x_dims[static_cast(t)]; }); - mask->set_dims(make_ddim(mask_dims)); + mask->set_dims(common::make_ddim(mask_dims)); mask->set_dtype(DataType::UINT8); } } @@ -1183,9 +1184,9 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { "with input tensor Y: %s", x_dims.to_str(), y_dims.to_str())); - std::vector x_dims_vec = phi::vectorize(x_dims); + std::vector x_dims_vec = common::vectorize(x_dims); std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 1); - x_dims = phi::make_ddim(x_dims_vec_cut); + x_dims = common::make_ddim(x_dims_vec_cut); out->set_dims(x_dims); out->set_dtype(x.dtype()); out->set_layout(x.layout()); @@ -1239,13 +1240,13 @@ void ElementwiseRawInferMeta(const MetaTensor& x, if (should_rotate) { // Pick bigger shape and rotate this one bool x_over_y = (x_dims.size() > y_dims.size()); - auto vdims = - x_over_y ? phi::vectorize(x_dims) : phi::vectorize(y_dims); + auto vdims = x_over_y ? common::vectorize(x_dims) + : common::vectorize(y_dims); std::rotate(vdims.begin() + 1, vdims.begin() + 2, vdims.end()); if (x_over_y) { - x_dims = phi::make_ddim(vdims); + x_dims = common::make_ddim(vdims); } else { - y_dims = phi::make_ddim(vdims); + y_dims = common::make_ddim(vdims); } } #endif @@ -1263,7 +1264,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out_dims_array.end()); } #endif - auto out_dims = phi::make_ddim(out_dims_array); + auto out_dims = common::make_ddim(out_dims_array); out->set_dims(out_dims); } else { out->set_dims(x.dims()); @@ -1298,9 +1299,9 @@ void EmbeddingInferMeta(const MetaTensor& x, table_dims.size(), table_dims)); - auto output_dims = phi::vectorize(ids_dims); + auto output_dims = common::vectorize(ids_dims); output_dims.push_back(table_dims[1]); - out->set_dims(phi::make_ddim(output_dims)); + out->set_dims(common::make_ddim(output_dims)); out->set_dtype(weight.dtype()); out->share_lod(x); } @@ -1324,9 +1325,9 @@ void CEmbeddingInferMeta(const MetaTensor& weight, table_dims.size(), table_dims)); - auto output_dims = phi::vectorize(ids_dims); + auto output_dims = common::vectorize(ids_dims); output_dims.push_back(table_dims[1]); - out->set_dims(phi::make_ddim(output_dims)); + out->set_dims(common::make_ddim(output_dims)); out->set_dtype(weight.dtype()); out->share_lod(x); @@ -1364,7 +1365,7 @@ void ExpandAsInferMeta(const MetaTensor& x, "to %d. 
But received: rank %u.", MAX_RANK_SUPPORTED, target_shape.size())); - out->set_dims(phi::make_ddim(target_shape)); + out->set_dims(common::make_ddim(target_shape)); out->set_dtype(x.dtype()); #undef MAX_RANK_SUPPORTED } @@ -1408,7 +1409,7 @@ static std::vector GetInputShape(phi::DDim dim, if (is_input_fused) { dim = dim.reshape(shape).transpose(axis); } - return phi::vectorize(dim); + return common::vectorize(dim); } void FusedMatmulInferMeta(const MetaTensor& x, @@ -1498,7 +1499,7 @@ void FusedMatmulInferMeta(const MetaTensor& x, new_dims.push_back(1); } - auto ddim_out = phi::make_ddim(new_dims); + auto ddim_out = common::make_ddim(new_dims); std::vector shape = fused_reshape_Out; const std::vector& axis = fused_transpose_Out; @@ -1584,7 +1585,7 @@ void GatherInferMeta(const MetaTensor& x, for (int i = 1; i < input_dim.size(); ++i) { out_dim_vec.emplace_back(input_dim[i]); } - auto output_dims = phi::make_ddim(out_dim_vec); + auto output_dims = common::make_ddim(out_dim_vec); out->set_dims(output_dims); out->set_dtype(x.dtype()); out->share_lod(x); @@ -1596,7 +1597,7 @@ void GatherInferMeta(const MetaTensor& x, for (int i = axis_v + 1; i < input_dim.size(); i++) { out_dim_vec.push_back(input_dim[i]); // NOLINT } - auto output_dims = phi::make_ddim(out_dim_vec); + auto output_dims = common::make_ddim(out_dim_vec); out->set_dims(output_dims); out->set_dtype(x.dtype()); out->share_lod(x); @@ -1621,7 +1622,7 @@ void GatherInferMeta(const MetaTensor& x, for (int i = axis_v + 1; i < input_dim.size(); i++) { out_dim_vec.push_back(input_dim[i]); // NOLINT } - auto output_dims = phi::make_ddim(out_dim_vec); + auto output_dims = common::make_ddim(out_dim_vec); out->set_dims(output_dims); out->set_dtype(x.dtype()); out->share_lod(x); @@ -1659,7 +1660,7 @@ void GatherNdInferMeta(const MetaTensor& x, result_dims.emplace_back(x_dims[i]); } - out->set_dims(phi::make_ddim(result_dims)); + out->set_dims(common::make_ddim(result_dims)); out->share_lod(x); out->set_dtype(x.dtype()); } @@ -1760,8 +1761,8 @@ void HuberLossInferMeta(const MetaTensor& input, input_dims.size(), label_dims.size())); - bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || - phi::contain_unknown_dim(label_dims); + bool contain_unknown_dim = common::contain_unknown_dim(input_dims) || + common::contain_unknown_dim(label_dims); if (config.is_runtime || !contain_unknown_dim) { PADDLE_ENFORCE_EQ( input_dims, @@ -1847,12 +1848,12 @@ void IndexSelectInferMeta(const MetaTensor& x, true, phi::errors::InvalidArgument("The length of Input(Index) can't be 0.")); - auto output_dim = phi::vectorize(input_dim); + auto output_dim = common::vectorize(input_dim); if (dim < 0) { dim += input_dim.size(); } output_dim[dim] = index_dim[0]; - output->set_dims(phi::make_ddim(output_dim)); + output->set_dims(common::make_ddim(output_dim)); output->set_dtype(x.dtype()); output->set_layout(x.layout()); output->share_lod(x); @@ -1874,12 +1875,12 @@ void IndexSelectStridedInferMeta(const MetaTensor& x, input_dim.size() - 1, dim)); - auto output_dim = phi::vectorize(input_dim); + auto output_dim = common::vectorize(input_dim); if (dim < 0) { dim += input_dim.size(); } output_dim.erase(output_dim.begin() + dim); - output->set_dims(phi::make_ddim(output_dim)); + output->set_dims(common::make_ddim(output_dim)); output->set_dtype(x.dtype()); output->set_layout(x.layout()); output->share_lod(x); @@ -1968,7 +1969,7 @@ void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { int64_t dim_yi = (i < rank - rank_y) ? 
1 : dim_y.at(i - (rank - rank_y)); dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi); } - out->set_dims(phi::make_ddim(dim_out)); + out->set_dims(common::make_ddim(dim_out)); out->set_dtype(x.dtype()); } @@ -1981,7 +1982,7 @@ void LogLossInferMeta(const MetaTensor& input, auto label_dims = label.dims(); if (config.is_runtime || - (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { + (common::product(pred_dims) > 0 && common::product(label_dims) > 0)) { PADDLE_ENFORCE_EQ( pred_dims, label_dims, @@ -2140,8 +2141,8 @@ void MatmulInferMeta(const MetaTensor& x, bool trans_x, bool trans_y, MetaTensor* out) { - std::vector dims_x = phi::vectorize(x.dims()); - std::vector dims_y = phi::vectorize(y.dims()); + std::vector dims_x = common::vectorize(x.dims()); + std::vector dims_y = common::vectorize(y.dims()); auto ndims_x = dims_x.size(); auto ndims_y = dims_y.size(); PADDLE_ENFORCE_GT(ndims_x, @@ -2198,7 +2199,7 @@ void MatmulInferMeta(const MetaTensor& x, new_dims.push_back(N); // NOLINT } - auto ddim_out = phi::make_ddim(new_dims); + auto ddim_out = common::make_ddim(new_dims); out->set_dims(ddim_out); if (x.dtype() == phi::DataType::INT8) { @@ -2221,7 +2222,7 @@ void MatmulWithFlattenInferMeta(const MetaTensor& x, << " x_num_col_dims=" << x_num_col_dims << " y_num_col_dims=" << y_num_col_dims; - PADDLE_ENFORCE_NE(phi::product(y_dims), + PADDLE_ENFORCE_NE(common::product(y_dims), 0, phi::errors::PreconditionNotMet( "The Input variable Y has not " @@ -2249,8 +2250,8 @@ void MatmulWithFlattenInferMeta(const MetaTensor& x, y_dims, y_num_col_dims)); - auto x_mat_dims = phi::flatten_to_2d(x_dims, x_num_col_dims); - auto y_mat_dims = phi::flatten_to_2d(y_dims, y_num_col_dims); + auto x_mat_dims = common::flatten_to_2d(x_dims, x_num_col_dims); + auto y_mat_dims = common::flatten_to_2d(y_dims, y_num_col_dims); PADDLE_ENFORCE_EQ( x_mat_dims[1], @@ -2279,7 +2280,7 @@ void MatmulWithFlattenInferMeta(const MetaTensor& x, output_dims.push_back(y_dims[i]); } - out->set_dims(phi::make_ddim(output_dims)); + out->set_dims(common::make_ddim(output_dims)); if (x.dtype() == phi::DataType::INT8) { out->set_dtype(phi::DataType::INT32); } else { @@ -2394,7 +2395,7 @@ void MatrixRankTolInferMeta(const MetaTensor& x, out_dims_array.data(), max_dim, axis); - out->set_dims(phi::make_ddim(out_dims_array)); + out->set_dims(common::make_ddim(out_dims_array)); } out->share_lod(x); } @@ -2421,7 +2422,7 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { dim_x, dim_vec)); - auto dim_out = phi::make_ddim({dim_x[0]}); + auto dim_out = common::make_ddim({dim_x[0]}); out->set_dims(dim_out); out->set_dtype(x.dtype()); @@ -2437,7 +2438,7 @@ void PReluInferMeta(const MetaTensor& x, MetaConfig config) { auto x_dim = x.dims(); if (mode == "all") { - PADDLE_ENFORCE_EQ(phi::product(alpha.dims()), + PADDLE_ENFORCE_EQ(common::product(alpha.dims()), 1, phi::errors::InvalidArgument( "For mode 'all', size of weight Alpha must be one. 
" @@ -2625,8 +2626,8 @@ void PriorBoxInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); var->set_dtype(input.dtype()); - out->set_dims(phi::make_ddim(dim_vec)); - var->set_dims(phi::make_ddim(dim_vec)); + out->set_dims(common::make_ddim(dim_vec)); + var->set_dims(common::make_ddim(dim_vec)); } void RepeatInterleaveWithTensorIndexInferMeta(const MetaTensor& x, @@ -2634,7 +2635,7 @@ void RepeatInterleaveWithTensorIndexInferMeta(const MetaTensor& x, int dim, MetaTensor* out) { const auto& input_dim = x.dims(); - auto output_dim = phi::vectorize(input_dim); + auto output_dim = common::vectorize(input_dim); PADDLE_ENFORCE_EQ( dim < input_dim.size() && dim >= (0 - input_dim.size()), true, @@ -2671,7 +2672,7 @@ void RepeatInterleaveWithTensorIndexInferMeta(const MetaTensor& x, } output_dim[dim] = -1; - out->set_dims(phi::make_ddim(output_dim)); + out->set_dims(common::make_ddim(output_dim)); out->share_lod(x); out->set_dtype(x.dtype()); } @@ -2740,7 +2741,7 @@ void SequenceMaskInferMeta(const MetaTensor& x, int maxlen, int out_dtype, MetaTensor* y) { - auto dim = phi::vectorize(x.dims()); + auto dim = common::vectorize(x.dims()); if (max_len_tensor) { dim.push_back(-1); @@ -2748,7 +2749,7 @@ void SequenceMaskInferMeta(const MetaTensor& x, dim.push_back(maxlen > 0 ? maxlen : -1); } - y->set_dims(phi::make_ddim(dim)); + y->set_dims(common::make_ddim(dim)); auto out_phi_dtype = phi::TransToPhiDataType(out_dtype); y->set_dtype(out_phi_dtype); } @@ -2858,8 +2859,8 @@ void TriangularSolveInferMeta(const MetaTensor& x, x_dims[x_dims_n - 2], x_dims[x_dims_n - 1])); - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); + std::vector x_dims_vec = common::vectorize(x_dims); + std::vector y_dims_vec = common::vectorize(y_dims); std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); @@ -2872,7 +2873,7 @@ void TriangularSolveInferMeta(const MetaTensor& x, {y_dims_vec[y_dims_n - 2], y_dims_vec[y_dims_n - 1]}); // dim of 'out' is the same with 'Y' after broadcast - out->set_dims(phi::make_ddim(y_broadcast_dims)); + out->set_dims(common::make_ddim(y_broadcast_dims)); out->set_dtype(y.dtype()); out->set_layout(y.layout()); out->share_lod(y); @@ -2893,9 +2894,9 @@ void TopPSamplingInferMeta(const MetaTensor& x, "But received x_dims[0] = %d and ps_dims[0] = %d.", x_dims[0], ps_dims[0])); - ids->set_dims(phi::make_ddim({x_dims[0], 1})); + ids->set_dims(common::make_ddim({x_dims[0], 1})); ids->set_dtype(DataType::INT64); - out->set_dims(phi::make_ddim({x_dims[0], 1})); + out->set_dims(common::make_ddim({x_dims[0], 1})); out->set_dtype(x.dtype()); } @@ -2962,24 +2963,24 @@ void LstsqInferMeta(const MetaTensor& x, m, y_dims[y_rank - 2])); - rank->set_dims(phi::make_ddim(batch_dims_vec)); + rank->set_dims(common::make_ddim(batch_dims_vec)); if (m > n) { batch_dims_vec.emplace_back(nrhs); - residuals->set_dims(phi::make_ddim(batch_dims_vec)); + residuals->set_dims(common::make_ddim(batch_dims_vec)); batch_dims_vec.pop_back(); } else { - residuals->set_dims(phi::make_ddim({0})); + residuals->set_dims(common::make_ddim({0})); } residuals->set_dtype(y.dtype()); batch_dims_vec.emplace_back(std::min(m, n)); - singular_values->set_dims(phi::make_ddim(batch_dims_vec)); + singular_values->set_dims(common::make_ddim(batch_dims_vec)); singular_values->set_dtype(y.dtype()); batch_dims_vec[x_rank - 2] = n; batch_dims_vec.emplace_back(nrhs); - 
solution->set_dims(phi::make_ddim(batch_dims_vec)); + solution->set_dims(common::make_ddim(batch_dims_vec)); solution->set_dtype(y.dtype()); } @@ -3088,11 +3089,11 @@ void YoloBoxInferMeta(const MetaTensor& x, box_num = -1; } std::vector dim_boxes({dim_x[0], box_num, 4}); - boxes->set_dims(phi::make_ddim(dim_boxes)); + boxes->set_dims(common::make_ddim(dim_boxes)); boxes->set_dtype(x.dtype()); std::vector dim_scores({dim_x[0], box_num, class_num}); - scores->set_dims(phi::make_ddim(dim_scores)); + scores->set_dims(common::make_ddim(dim_scores)); } void ValueCompareInferMeta(const MetaTensor& x, @@ -3109,8 +3110,8 @@ void SolveInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { auto x_dims = x.dims(); auto y_dims = y.dims(); - std::vector x_dims_vec = phi::vectorize(x.dims()); - std::vector y_dims_vec = phi::vectorize(y.dims()); + std::vector x_dims_vec = common::vectorize(x.dims()); + std::vector y_dims_vec = common::vectorize(y.dims()); auto x_dims_n = x_dims_vec.size(); auto y_dims_n = y_dims_vec.size(); @@ -3184,7 +3185,7 @@ void SolveInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { new_dims.push_back(1); } - auto out_dims = phi::make_ddim(new_dims); + auto out_dims = common::make_ddim(new_dims); out->set_dims(out_dims); out->set_dtype(x.dtype()); @@ -3234,7 +3235,7 @@ void UnpoolInferMeta(const MetaTensor& x, } } if (out != nullptr) { - out->set_dims(phi::make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->set_dtype(x.dtype()); } } @@ -3275,7 +3276,7 @@ void Unpool3dInferMeta(const MetaTensor& x, } } if (out != nullptr) { - out->set_dims(phi::make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->set_dtype(x.dtype()); } } @@ -3305,7 +3306,7 @@ void WeightDequantizeInferMeta(const MetaTensor& x, x.dims()[0])); int n = x.dims()[1]; int k = x.dims()[0]; - out->set_dims(phi::make_ddim({n, k})); + out->set_dims(common::make_ddim({n, k})); out->set_dtype(out_dtype); } diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 0bda38a08d651b..37bb925067f67a 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -15,7 +15,7 @@ limitations under the License. 
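For the ConvInferMeta and ConvTransposeInferMeta hunks further up, the per-spatial-dimension arithmetic (once UpdatePaddingAndDilation has normalized the paddings) follows the standard pair of formulas; the helper names are illustrative:

#include <cstdint>

// Forward convolution: output extent for one spatial dim.
int64_t ConvOutputSize(int64_t in, int64_t ksize, int64_t dilation,
                       int64_t pad_l, int64_t pad_r, int64_t stride) {
  if (in < 0) return -1;  // unknown extent stays unknown
  const int64_t dkernel = dilation * (ksize - 1) + 1;
  return (in + pad_l + pad_r - dkernel) / stride + 1;
}

// Transposed convolution: the inferred minimum output extent. An explicit
// output_size must lie in [infer, infer + stride), which is exactly what
// the PADDLE_ENFORCE checks in ConvTransposeInferMeta verify.
int64_t ConvTransposeOutputSize(int64_t in, int64_t ksize, int64_t dilation,
                                int64_t pad_sum, int64_t stride) {
  return (in - 1) * stride - pad_sum + dilation * (ksize - 1) + 1;
}

// e.g. ConvOutputSize(224, 3, 1, 1, 1, 2) == 112
//      ConvTransposeOutputSize(112, 3, 1, 2, 2) == 223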
*/ #include "paddle/phi/infermeta/fusion.h" #include #include -#include "paddle/phi/common/layout.h" +#include "paddle/common/layout.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" @@ -65,7 +65,7 @@ static phi::DDim BroadCastInferShape(const DDim x_dims, max_dim, axis); - return phi::make_ddim(out_dims_array); + return common::make_ddim(out_dims_array); } return x_dims; } @@ -89,7 +89,7 @@ void AddActXPUInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); out->set_layout(x.layout()); out->share_lod(x); - out_max->set_dims(phi::make_ddim({6})); + out_max->set_dims(common::make_ddim({6})); out_max->set_dtype(x.dtype()); out_max->set_layout(x.layout()); } @@ -206,7 +206,7 @@ void Conv1dXPUInferMeta(const MetaTensor& x, out->set_dims(DDim(out_shape.data(), static_cast(out_shape.size()))); out->set_dtype(x.dtype()); out->set_layout(x.layout()); - out_max->set_dims(phi::make_ddim({6})); + out_max->set_dims(common::make_ddim({6})); } void Conv2dXPUInferMeta(const MetaTensor& x, @@ -270,7 +270,7 @@ void Conv2dXPUInferMeta(const MetaTensor& x, in_dims.size(), in_dims, strides.size(), - phi::make_ddim(strides), + common::make_ddim(strides), in_sub_stride_size)); for (int i = 0; i < dilation_size; ++i) { @@ -313,9 +313,10 @@ void Conv2dXPUInferMeta(const MetaTensor& x, // update paddings and dilations accoring to padding_algorithm std::vector paddings_vec = paddings; std::vector dilations_vec = dilations; - DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); + DDim in_data_dims = common::slice_ddim(in_dims, 2, in_dims.size()); + DDim filter_data_dims = + common::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); phi::UpdatePaddingAndDilation(&paddings_vec, &dilations_vec, padding_algorithm, @@ -334,7 +335,7 @@ void Conv2dXPUInferMeta(const MetaTensor& x, } // set output and output max dims out->set_dims(DDim(out_shape.data(), static_cast(out_shape.size()))); - out_max->set_dims(phi::make_ddim({6})); + out_max->set_dims(common::make_ddim({6})); out->set_dtype(out_dtype); } @@ -358,7 +359,7 @@ void EmbeddingWithEltwiseAddXPUInferMeta( auto id_dims = ids[0]->dims(); auto table_dims = tables[0]->dims(); - out->set_dims(phi::make_ddim({id_dims[0], id_dims[1], table_dims[1]})); + out->set_dims(common::make_ddim({id_dims[0], id_dims[1], table_dims[1]})); out->set_dtype(tables[0]->dtype()); out->set_layout(ids[0]->layout()); } @@ -387,7 +388,7 @@ void FcXPUInferMeta(const MetaTensor& x, out->set_dims(DDim(out_shape.data(), static_cast(out_shape.size()))); out->set_dtype(out_dtype); out->set_layout(x.layout()); - out_max->set_dims(phi::make_ddim({6})); + out_max->set_dims(common::make_ddim({6})); out_max->set_dtype(x.dtype()); out_max->set_layout(x.layout()); } @@ -841,7 +842,7 @@ void FusedFeedForwardInferMeta(const MetaTensor& x, if (x_dim.size() > 1) { return x_dim; } - return make_ddim({1, x_dim[0]}); + return common::make_ddim({1, x_dim[0]}); }; auto mat_dim_x = @@ -868,7 +869,8 @@ void FusedFeedForwardInferMeta(const MetaTensor& x, dropout2_mask->set_dims(dim_x); } - auto mean_dim = phi::make_ddim({mat_dim_x.batch_size_ * mat_dim_x.height_}); + auto mean_dim = + common::make_ddim({mat_dim_x.batch_size_ * mat_dim_x.height_}); if (pre_layer_norm) { ln1_out->set_dims(dim_x); ln1_mean->set_dims(mean_dim); @@ -1237,7 +1239,8 
@@ void FusedGemmEpilogueInferMeta(const MetaTensor& x, bias_dims, y_dims)); - auto x_mat_dims = phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1); + auto x_mat_dims = + common::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1); int K_from_x = static_cast(trans_x ? x_mat_dims[0] : x_mat_dims[1]); int K_from_y = static_cast(trans_y ? y_dims[1] : y_dims[0]); @@ -1264,11 +1267,11 @@ void FusedGemmEpilogueInferMeta(const MetaTensor& x, } else { out_dims.push_back(y_dims[1]); } - out->set_dims(phi::make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype(x.dtype()); if (reserve_space) { - reserve_space->set_dims(phi::make_ddim(out_dims)); + reserve_space->set_dims(common::make_ddim(out_dims)); reserve_space->set_dtype(x.dtype()); if (activation == "none") { PADDLE_THROW(phi::errors::InvalidArgument( @@ -1337,8 +1340,8 @@ void FusedGemmEpilogueGradInferMeta(const MetaTensor& x, dout_dims.size(), x_dims.size())); - auto dout_mat_dims = phi::flatten_to_2d(dout_dims, dout_dims.size() - 1); - auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1); + auto dout_mat_dims = common::flatten_to_2d(dout_dims, dout_dims.size() - 1); + auto x_mat_dims = common::flatten_to_2d(x_dims, x_dims.size() - 1); PADDLE_ENFORCE_EQ( dout_mat_dims[1], @@ -1373,7 +1376,7 @@ void FusedGemmEpilogueGradInferMeta(const MetaTensor& x, if (bias_grad) { int64_t dbias_dim = trans_y ? y_dims[0] : y_dims[1]; - bias_grad->set_dims(phi::make_ddim({dbias_dim})); + bias_grad->set_dims(common::make_ddim({dbias_dim})); bias_grad->set_dtype(y.dtype()); } } @@ -1608,7 +1611,7 @@ void YoloBoxXPUInferMeta(const MetaTensor& x, left_slice_out_dims_vector.data(), 1, true); - auto left_slice_out_dims = phi::make_ddim(left_slice_out_dims_vector); + auto left_slice_out_dims = common::make_ddim(left_slice_out_dims_vector); auto grid_dims = grid.dims(); auto left_add_out_dims = BroadCastInferShape(left_slice_out_dims, grid_dims, -1); @@ -1630,7 +1633,7 @@ void YoloBoxXPUInferMeta(const MetaTensor& x, mid_slice_out_dims_vector.data(), 1, true); - auto mid_slice_out_dims = phi::make_ddim(mid_slice_out_dims_vector); + auto mid_slice_out_dims = common::make_ddim(mid_slice_out_dims_vector); auto anchor_grid_dims = anchor_grid.dims(); auto mid_mul_out_dims = BroadCastInferShape(mid_slice_out_dims, anchor_grid_dims, -1); @@ -1648,7 +1651,7 @@ void YoloBoxXPUInferMeta(const MetaTensor& x, right_slice_out_dims_vector.data(), 1, true); - auto right_slice_out_dims = phi::make_ddim(right_slice_out_dims_vector); + auto right_slice_out_dims = common::make_ddim(right_slice_out_dims_vector); // compute concat out_dims std::vector in_dims; in_dims.reserve(3); @@ -1661,7 +1664,7 @@ void YoloBoxXPUInferMeta(const MetaTensor& x, out->set_dims(out_dim); out->set_dtype(x.dtype()); out->set_layout(x.layout()); - out_max->set_dims(phi::make_ddim({6})); + out_max->set_dims(common::make_ddim({6})); out_max->set_dtype(x.dtype()); out_max->set_layout(x.layout()); } @@ -1767,7 +1770,7 @@ void ConvTransposeXPUInferMeta(const MetaTensor& x, x_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); } DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); @@ -1795,9 +1798,9 @@ void ConvTransposeXPUInferMeta(const MetaTensor& x, output_shape.push_back(filter_dims[1] * groups); } - 
out->set_dims(make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->set_dtype(x.dtype()); - out_max->set_dims(phi::make_ddim({6})); + out_max->set_dims(common::make_ddim({6})); } void Conv2dTransposeXPUInferMeta(const MetaTensor& x, @@ -1876,7 +1879,7 @@ void BNActXPUInferMeta(const MetaTensor& x, x_dims)); } - const DataLayout data_layout_str = phi::StringToDataLayout(data_layout); + const DataLayout data_layout_str = common::StringToDataLayout(data_layout); PADDLE_ENFORCE_GE( x_dims.size(), @@ -1924,7 +1927,7 @@ void BNActXPUInferMeta(const MetaTensor& x, bool check = true; if ((!config.is_runtime) && - (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) { + (common::product(scale_dim) <= 0 || common::product(bias_dim) <= 0)) { check = false; } @@ -2072,9 +2075,10 @@ void FusedScaleBiasReluConvBnInferMeta(const MetaTensor& x, std::vector paddings_vec = paddings; std::vector dilations_vec = dilations; // get "HW" from "NHWC" - DDim in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); + DDim in_data_dims = common::slice_ddim(in_dims, 1, in_dims.size() - 1); + DDim filter_data_dims = + common::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); phi::UpdatePaddingAndDilation(&paddings_vec, &dilations_vec, padding_algorithm, @@ -2093,7 +2097,7 @@ void FusedScaleBiasReluConvBnInferMeta(const MetaTensor& x, } out_shape.push_back(filter_dims[0]); // make shape for other outputs - auto c_dims = phi::make_ddim({filter_dims[0]}); + auto c_dims = common::make_ddim({filter_dims[0]}); // set output and output max dims out->set_dims(DDim(out_shape.data(), static_cast(out_shape.size()))); out_running_mean->set_dims(c_dims); @@ -2225,7 +2229,7 @@ void FusedEmbeddingEltWiseLayerNormInferMeta( hidden)); } - auto dim_output = phi::make_ddim({batch, seq_len, hidden}); + auto dim_output = common::make_ddim({batch, seq_len, hidden}); out->set_dims(dim_output); out->share_lod(*ids[0]); out->set_dtype((*embs[0]).dtype()); @@ -2289,7 +2293,7 @@ void FusionTransposeFlattenConcatInferMeta( if (out_dims[concat_axis] < 0) { out_dims[concat_axis] = -1; } - out->set_dims(phi::make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype((*x[0]).dtype()); } @@ -2368,7 +2372,7 @@ void FusedFCElementwiseLayerNormInferMeta(const MetaTensor& x, x_dims.size(), x_dims)); - auto x_mat_dims = phi::flatten_to_2d(x_dims, x_num_col_dims); + auto x_mat_dims = common::flatten_to_2d(x_dims, x_num_col_dims); PADDLE_ENFORCE_EQ( x_mat_dims[1], w_dims[0], @@ -2389,13 +2393,13 @@ void FusedFCElementwiseLayerNormInferMeta(const MetaTensor& x, fc_out_dims.push_back(w_dims[1]); DDim y_dims = y.dims(); - PADDLE_ENFORCE_EQ(phi::make_ddim(fc_out_dims), + PADDLE_ENFORCE_EQ(common::make_ddim(fc_out_dims), y_dims, phi::errors::InvalidArgument( "The output's shape of fc is expected to be equal to " "that of input Y. 
But received output's shape of fc " "is %s, input Y's shape is %s.", - phi::make_ddim(fc_out_dims), + common::make_ddim(fc_out_dims), y_dims)); PADDLE_ENFORCE_LT( @@ -2410,7 +2414,7 @@ void FusedFCElementwiseLayerNormInferMeta(const MetaTensor& x, y_dims.size(), y_dims)); - auto y_mat_dim = phi::flatten_to_2d(y_dims, begin_norm_axis); + auto y_mat_dim = common::flatten_to_2d(y_dims, begin_norm_axis); int64_t dim_0 = y_mat_dim[0]; int64_t dim_1 = y_mat_dim[1]; if (scale) { @@ -2512,7 +2516,7 @@ void Conv2dFusionInferMeta(const MetaTensor& input, data_format, channel_last, config); - output->set_dims(phi::make_ddim(out_shape)); + output->set_dims(common::make_ddim(out_shape)); output->set_dtype(input.dtype()); if (data_format == "NHWC") { output->set_layout(phi::DataLayout::NHWC); @@ -2533,17 +2537,17 @@ void Conv2dFusionInferMeta(const MetaTensor& input, "Attr(split_channels) = %u, the content = [%s].", outputs.size(), split_channels.size(), - phi::make_ddim(split_channels))); + common::make_ddim(split_channels))); int split_channels_sum = 0; std::vector output_shapes(split_channels.size()); for (size_t i = 0; i < split_channels.size(); ++i) { split_channels_sum += split_channels[i]; if (channel_last) { - output_shapes[i] = phi::make_ddim( + output_shapes[i] = common::make_ddim( {out_shape[0], out_shape[1], out_shape[2], split_channels[i]}); } else { - output_shapes[i] = phi::make_ddim( + output_shapes[i] = common::make_ddim( {out_shape[0], split_channels[i], out_shape[2], out_shape[3]}); } } @@ -2650,13 +2654,13 @@ void FusionRepeatedFCReluInferMeta(const MetaTensor& x, i, w_dims[i].size())); PADDLE_ENFORCE_EQ( - phi::product(b_dims[i]), + common::product(b_dims[i]), w_dims[i][1], phi::errors::InvalidArgument( "The length of Bias must be equal with w_dims[1], but received " "product(b_dims[%d]) = %d, w_dims[%d][1] = %d.", i, - phi::product(b_dims[i]), + common::product(b_dims[i]), i, w_dims[i][1])); } @@ -2739,7 +2743,7 @@ void FusionGRUInferMeta(const MetaTensor& x, DDim x_dims = x.dims(); auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) - ? phi::flatten_to_2d(x_dims, 1) + ? common::flatten_to_2d(x_dims, 1) : x_dims; PADDLE_ENFORCE_EQ( x_mat_dims.size(), @@ -3113,7 +3117,7 @@ void FCInferMeta(const MetaTensor& input, phi::funcs::FCOutputSize( in_dims, w_dims, output_dims, in_num_col_dims, padding_weights); - out->set_dims(phi::make_ddim(output_dims)); + out->set_dims(common::make_ddim(output_dims)); out->share_lod(input); out->set_dtype(input.dtype()); } diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 22dfe4b059ed31..8a7eaf82bdc585 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include "glog/logging.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/device_memory_aligment.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" @@ -57,7 +57,7 @@ void AdadeltaInferMeta(const MetaTensor& param, MetaTensor* master_param_out) { auto lr_dims = learning_rate.dims(); PADDLE_ENFORCE_EQ( - phi::product(lr_dims), + common::product(lr_dims), 1, phi::errors::InvalidArgument("LearningRate should have one element")); auto param_dims = param.dims(); @@ -115,7 +115,7 @@ void AdagradInferMeta(const MetaTensor& param, MetaTensor* master_param_out) { auto lr_dims = learning_rate.dims(); PADDLE_ENFORCE_EQ( - phi::product(lr_dims), + common::product(lr_dims), 1, phi::errors::InvalidArgument("LearningRate should have one element")); auto param_dims = param.dims(); @@ -171,7 +171,7 @@ void AdamInferMeta(const MetaTensor& param, MetaTensor* master_param_outs) { auto lr_dims = learning_rate.dims(); PADDLE_ENFORCE_EQ( - phi::product(lr_dims), + common::product(lr_dims), 1, errors::InvalidArgument( "The number of LearningRate shall be 1, but received %d. Maybe " @@ -179,23 +179,23 @@ void AdamInferMeta(const MetaTensor& param, "been initialized. You may need to confirm " "if you put exe.run(startup_program) " "after optimizer.minimize function.", - phi::product(lr_dims))); + common::product(lr_dims))); auto beta1_pow_dims = beta1_pow.dims(); VLOG(3) << "dims of Beta1Pow : [" << beta1_pow_dims << "]"; - PADDLE_ENFORCE_GE(phi::product(beta1_pow_dims), + PADDLE_ENFORCE_GE(common::product(beta1_pow_dims), 1, errors::InvalidArgument( "The size of Beta1 power accumulator should be greater " "than 0, but received %d.", - phi::product(beta1_pow_dims))); + common::product(beta1_pow_dims))); auto beta2_pow_dims = beta2_pow.dims(); VLOG(3) << "dims of Beta2Pow : [" << beta2_pow_dims << "]"; - PADDLE_ENFORCE_GE(phi::product(beta2_pow_dims), + PADDLE_ENFORCE_GE(common::product(beta2_pow_dims), 1, errors::InvalidArgument( "The size of Beta2 power accumulator should be greater " "than 0, but received %d.", - phi::product(beta2_pow_dims))); + common::product(beta2_pow_dims))); auto param_dims = param.dims(); PADDLE_ENFORCE_EQ( @@ -395,7 +395,7 @@ void AddNInferMeta(const std::vector& x, continue; } // for zero-sized tensor - if (phi::product(x_dim) == 0) { + if (common::product(x_dim) == 0) { continue; } // for 0D tensor @@ -403,7 +403,7 @@ void AddNInferMeta(const std::vector& x, continue; } is_all_0d_tensor = false; - if (phi::product(in_dim) == 0) { + if (common::product(in_dim) == 0) { in_dim = x_dim; } else { if (config.is_runtime) { @@ -451,7 +451,7 @@ void AddNInferMeta(const std::vector& x, } } if (is_all_0d_tensor) { - out->set_dims(make_ddim({})); + out->set_dims(common::make_ddim({})); } else { out->set_dims(in_dim); } @@ -477,7 +477,7 @@ void AddNTensorArrayInferMeta(const std::vector& x, if (has_tensor_array) { if (out->is_tensor_array()) { - out->set_dims(make_ddim({max_length})); + out->set_dims(common::make_ddim({max_length})); } } else { AddNInferMeta(x, out, config); @@ -508,14 +508,14 @@ void AucInferMeta(const MetaTensor& input, predict_dims)); auto predict_width = predict_dims[1]; PADDLE_ENFORCE_NE( - phi::product(predict_dims), + common::product(predict_dims), 0, phi::errors::InvalidArgument( "The Input(Predict) has not been initialized properly. 
The " "shape of Input(Predict) = [%s], the shape can not involes 0.", predict_dims)); PADDLE_ENFORCE_NE( - phi::product(label_dims), + common::product(label_dims), 0, phi::errors::InvalidArgument( "The Input(Label) has not been initialized properly. The " @@ -550,7 +550,7 @@ void AucInferMeta(const MetaTensor& input, 0, phi::errors::InvalidArgument("slide_steps must be natural number")); - auc->set_dims(phi::make_ddim({})); + auc->set_dims(common::make_ddim({})); auc->set_dtype(DataType::INT64); if (slide_steps) { @@ -658,7 +658,7 @@ void BatchNormInferMeta(const MetaTensor& x, x_dims)); } - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); PADDLE_ENFORCE_GE( x_dims.size(), @@ -709,8 +709,8 @@ void BatchNormInferMeta(const MetaTensor& x, bool check = true; if (!scale || !bias || - ((!config.is_runtime) && - (phi::product(scale.dims()) <= 0 || phi::product(bias.dims()) <= 0))) { + ((!config.is_runtime) && (common::product(scale.dims()) <= 0 || + common::product(bias.dims()) <= 0))) { check = false; } @@ -887,7 +887,7 @@ void BroadcastTensorsInferMeta(const std::vector& x, // 3. Set Output Dim for (size_t i = 0; i < out.size(); i++) { - out[i]->set_dims(phi::make_ddim(target_dims)); + out[i]->set_dims(common::make_ddim(target_dims)); out[i]->share_lod(*(x[i])); out[i]->set_dtype(x[i]->dtype()); } @@ -936,7 +936,7 @@ void CoalesceTensorInferMeta(const std::vector& input, int64_t numel = 0; for (auto item : input) { const auto& dim = item->dims(); - auto size = phi::product(dim); + auto size = common::product(dim); auto len = use_align ? phi::Alignment(static_cast(size) * size_of_dtype, phi::GPUPlace(), @@ -946,9 +946,9 @@ void CoalesceTensorInferMeta(const std::vector& input, numel += len; } if (fused_output) { - fused_output->set_dims(phi::make_ddim({numel})); + fused_output->set_dims(common::make_ddim({numel})); fused_output->set_dtype(dtype); - VLOG(4) << "fused_output size:" << phi::make_ddim({numel}); + VLOG(4) << "fused_output size:" << common::make_ddim({numel}); } #else return; @@ -968,7 +968,7 @@ void CoalesceTensorInferMeta(const std::vector& input, for (auto item : input) { const auto& dim = item->dims(); - auto size = phi::product(dim); + auto size = common::product(dim); auto len = use_align ? 
alignment(static_cast(size) * size_of_dtype, align_size) / @@ -977,9 +977,9 @@ void CoalesceTensorInferMeta(const std::vector& input, numel += static_cast(len); } if (fused_output) { - fused_output->set_dims(phi::make_ddim({numel})); + fused_output->set_dims(common::make_ddim({numel})); fused_output->set_dtype(dtype); - VLOG(4) << "fused_output size:" << phi::make_ddim({numel}); + VLOG(4) << "fused_output size:" << common::make_ddim({numel}); } } } @@ -995,11 +995,11 @@ void CheckMemoryContinueInferMeta(const std::vector& input, int64_t numel = 0; for (auto item : input) { const auto& dim = item->dims(); - auto size = phi::product(dim); + auto size = common::product(dim); auto len = size * phi::SizeOf(item->dtype()); numel += static_cast(len); } - output->set_dims(phi::make_ddim({numel})); + output->set_dims(common::make_ddim({numel})); output->set_dtype(phi::DataType::INT8); } @@ -1014,7 +1014,7 @@ void ConcatInferMeta(const std::vector& x, "than 0.")); if (axis_scalar.FromTensor()) { auto out_dims = - phi::make_ddim(std::vector(x.at(0)->dims().size(), -1)); + common::make_ddim(std::vector(x.at(0)->dims().size(), -1)); out->set_dims(out_dims); out->set_dtype(x.at(0)->dtype()); out->set_layout(x.at(0)->layout()); @@ -1140,7 +1140,7 @@ void DecayedAdagradInferMeta(const MetaTensor& param, MetaTensor* param_out, MetaTensor* moment_out) { auto lr_dims = learning_rate.dims(); - PADDLE_ENFORCE_NE(phi::product(lr_dims), + PADDLE_ENFORCE_NE(common::product(lr_dims), 0, phi::errors::InvalidArgument( "Maybe the Input variable LearningRate has not " @@ -1148,7 +1148,7 @@ void DecayedAdagradInferMeta(const MetaTensor& param, "if you put exe.run(startup_program) " "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ( - phi::product(lr_dims), + common::product(lr_dims), 1, phi::errors::InvalidArgument("LearningRate should have one element")); auto param_dims = param.dims(); @@ -1376,7 +1376,7 @@ void DeformableConvInferMeta(const MetaTensor& x, } } - out->set_dims(phi::make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->set_dtype(x.dtype()); } @@ -1400,19 +1400,19 @@ void DGCMomentumInferMeta(const MetaTensor& param, MetaTensor* grad_out) { auto lr_dims = learning_rate.dims(); - PADDLE_ENFORCE_NE(phi::product(lr_dims), + PADDLE_ENFORCE_NE(common::product(lr_dims), 0, phi::errors::InvalidArgument( "Maybe the Input variable LearningRate has not " "been initialized. You may need to confirm " "if you put exe.run(startup_program) " "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), + PADDLE_ENFORCE_EQ(common::product(lr_dims), 1, phi::errors::InvalidArgument( "Learning_rate should be a scalar. 
But Received " "LearningRate's dim [%s]", - phi::product(lr_dims))); + common::product(lr_dims))); auto param_dims = param.dims(); auto grad_dims = grad.dims(); @@ -1524,7 +1524,7 @@ void EditDistanceInferMeta(const MetaTensor& hyps, out->set_dims(refs.dims()); out->set_dtype(DataType::FLOAT32); - sequencenum->set_dims(phi::make_ddim({1})); + sequencenum->set_dims(common::make_ddim({1})); sequencenum->set_dtype(DataType::FLOAT32); } @@ -1598,9 +1598,9 @@ void FusedBiasActInferMeta(const MetaTensor& x, phi::errors::InvalidArgument( "The seconde dimension of x must be even, but receive %d", dim)); dim /= 2; - out->set_dims(phi::make_ddim({token_num, dim})); + out->set_dims(common::make_ddim({token_num, dim})); } else if (act_method == "gelu" || act_method == "relu") { - out->set_dims(phi::make_ddim({token_num, dim})); + out->set_dims(common::make_ddim({token_num, dim})); } else { PADDLE_THROW( errors::InvalidArgument("act_method must be geglu, swiglu or gelu, " @@ -1709,7 +1709,7 @@ void FusedLayerNormInferMeta(const MetaTensor& x, MetaTensor* residual_out, MetaTensor* mean, MetaTensor* variance) { - std::vector x_dims_vec = phi::vectorize(x.dims()); + std::vector x_dims_vec = common::vectorize(x.dims()); auto x_dims_size = x_dims_vec.size(); size_t normalized_dims = 1; @@ -1734,7 +1734,7 @@ void FusedLayerNormInferMeta(const MetaTensor& x, norm_weight.dims()[0])); } - auto out_dims = phi::make_ddim(x_dims_vec); + auto out_dims = common::make_ddim(x_dims_vec); out->set_dims(out_dims); if (residual_out && !norm_weight && !norm_bias) { @@ -1752,11 +1752,11 @@ void FusedLayerNormInferMeta(const MetaTensor& x, residual_out->set_dtype(x.dtype()); residual_out->set_layout(x.layout()); - mean->set_dims(phi::make_ddim({rows})); + mean->set_dims(common::make_ddim({rows})); mean->set_dtype(DataType::FLOAT32); mean->set_layout(x.layout()); - variance->set_dims(phi::make_ddim({rows})); + variance->set_dims(common::make_ddim({rows})); variance->set_dtype(DataType::FLOAT32); variance->set_layout(x.layout()); } @@ -1900,8 +1900,8 @@ void GenerateProposalsV2InferMeta(const MetaTensor& scores, MetaTensor* rpn_rois, MetaTensor* rpn_roi_probs, MetaTensor* rpn_rois_num) { - rpn_rois->set_dims(phi::make_ddim({-1, 4})); - rpn_roi_probs->set_dims(phi::make_ddim({-1, 1})); + rpn_rois->set_dims(common::make_ddim({-1, 4})); + rpn_roi_probs->set_dims(common::make_ddim({-1, 1})); } void GraphReindexInferMeta(const MetaTensor& x, @@ -2025,7 +2025,7 @@ void HSigmoidLossInferMeta(const MetaTensor& x, label_dims)); std::vector output_shape({input_dims, 1}); - out->set_dims(phi::make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->share_lod(x); out->set_dtype(x.dtype()); } @@ -2053,7 +2053,7 @@ static void Interpolate1DInferShapeCheck( "Interpolation method can only be \"linear\" when" "Input(X) dimension is 3, but got method = %s .", interp_method)); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); for (int i = 0; i < dim_x.size(); ++i) { PADDLE_ENFORCE_NE( dim_x[i], @@ -2184,7 +2184,7 @@ static void Interpolate2DInferShapeCheck( "Interpolation method can only be \"bilinear\" or \"nearest\" when " "Input(X) dimension is 4, but got method = %s.", interp_method)); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); for (int i = 0; i < dim_x.size(); ++i) { PADDLE_ENFORCE_NE( @@ -2337,7 +2337,7 @@ 
static void Interpolate3DInferShapeCheck( "\"nearest\" when Input(X) " "dimension is 5, but got method = %s .", interp_method)); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); for (int i = 0; i < dim_x.size(); ++i) { PADDLE_ENFORCE_NE( @@ -2583,7 +2583,7 @@ void LambInferMeta(const MetaTensor& param, MetaTensor* master_param_outs) { auto lr_dims = learning_rate.dims(); PADDLE_ENFORCE_NE( - phi::product(lr_dims), + common::product(lr_dims), 0, phi::errors::InvalidArgument( "The number of LearningRate shall not be 0, but received %d. Maybe " @@ -2591,27 +2591,27 @@ void LambInferMeta(const MetaTensor& param, "been initialized. You may need to confirm " "if you put exe.run(startup_program) " "after optimizer.minimize function.", - phi::product(lr_dims))); + common::product(lr_dims))); PADDLE_ENFORCE_EQ( - phi::product(lr_dims), + common::product(lr_dims), 1, phi::errors::InvalidArgument( "Learning rate should have 1 dimension, but received %d.", - phi::product(lr_dims))); + common::product(lr_dims))); auto beta1_pow_dims = beta1_pow.dims(); - PADDLE_ENFORCE_GE(phi::product(beta1_pow_dims), + PADDLE_ENFORCE_GE(common::product(beta1_pow_dims), 1, phi::errors::InvalidArgument( "The size of Beta1 power accumulator should be " "greater than 0, but received %d.", - phi::product(beta1_pow_dims))); + common::product(beta1_pow_dims))); auto beta2_pow_dims = beta2_pow.dims(); - PADDLE_ENFORCE_GE(phi::product(beta2_pow_dims), + PADDLE_ENFORCE_GE(common::product(beta2_pow_dims), 1, phi::errors::InvalidArgument( "The size of Beta2 power accumulator should be " "greater than 0, but received %d.", - phi::product(beta2_pow_dims))); + common::product(beta2_pow_dims))); auto param_dims = param.dims(); PADDLE_ENFORCE_EQ( @@ -2726,12 +2726,12 @@ void LarsMomentumInferMeta( grad_dim.size())); for (auto& lr_dim : lr_dims) { - PADDLE_ENFORCE_EQ(phi::product(lr_dim), + PADDLE_ENFORCE_EQ(common::product(lr_dim), 1, phi::errors::InvalidArgument( "Learning_rate should be a scalar. 
But Received " "LearningRate's dim [%s]", - phi::product(lr_dim))); + common::product(lr_dim))); } for (size_t i = 0; i < param_dim.size(); ++i) { @@ -2826,33 +2826,33 @@ void LogspaceInferMeta(const MetaTensor& start, MetaTensor* out) { auto s_dims = start.dims(); PADDLE_ENFORCE_EQ( - phi::product(s_dims), + common::product(s_dims), 1, phi::errors::InvalidArgument("The size of Input(Start) must be 1," "but received input size is %s.", - phi::product(s_dims))); + common::product(s_dims))); auto e_dims = stop.dims(); PADDLE_ENFORCE_EQ( - phi::product(e_dims), + common::product(e_dims), true, phi::errors::InvalidArgument("The size of Input(Stop) must be 1," "but received input size is %s.", - phi::product(e_dims))); + common::product(e_dims))); auto num_dims = number.dims(); PADDLE_ENFORCE_EQ( - phi::product(num_dims), + common::product(num_dims), true, phi::errors::InvalidArgument("The size of Input(Num) must be 1," "but received input size is %s.", - phi::product(num_dims))); + common::product(num_dims))); auto b_dims = base.dims(); - PADDLE_ENFORCE_EQ(phi::product(b_dims), + PADDLE_ENFORCE_EQ(common::product(b_dims), true, phi::errors::InvalidArgument( "The size of Input(Base) must be 1," - "but received input size is phi::product(b_dims).", - phi::product(b_dims))); - out->set_dims(phi::make_ddim({-1})); + "but received input size is common::product(b_dims).", + common::product(b_dims))); + out->set_dims(common::make_ddim({-1})); out->set_dtype(dtype); } @@ -2970,15 +2970,15 @@ void MemoryEfficientAttentionInferMeta(const MetaTensor& query, std::vector logsumexp_dims({query_num_head, query_batch_size}); std::vector seed_and_offset_dims({2}); - output->set_dims(phi::make_ddim(out_dims)); + output->set_dims(common::make_ddim(out_dims)); output->share_lod(query); output->set_dtype(query.dtype()); output->set_layout(query.layout()); - logsumexp->set_dims(phi::make_ddim(logsumexp_dims)); + logsumexp->set_dims(common::make_ddim(logsumexp_dims)); logsumexp->set_dtype(phi::DataType::FLOAT32); - seed_and_offset->set_dims(phi::make_ddim(seed_and_offset_dims)); + seed_and_offset->set_dims(common::make_ddim(seed_and_offset_dims)); seed_and_offset->set_dtype(phi::DataType::INT64); } @@ -3060,7 +3060,7 @@ void VariableLengthMemoryEfficientAttentionInferMeta( std::vector out_dims( {query_batch_size, query_num_head, query_seq_length, value_head_size}); - out->set_dims(phi::make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype(query.dtype()); out->set_layout(query.layout()); } @@ -3078,7 +3078,7 @@ void MeshgridInferMeta(const std::vector& inputs, out_shape[i] = static_cast(inputs[i]->dims()[0]); } } - auto out_dims = phi::make_ddim(std::vector(out_shape)); + auto out_dims = common::make_ddim(std::vector(out_shape)); for (auto& output : outputs) { output->set_dims(out_dims); output->set_dtype(inputs[0]->dtype()); @@ -3110,18 +3110,18 @@ void MomentumInferMeta(const MetaTensor& param, auto lr_dims = learning_rate.dims(); PADDLE_ENFORCE_NE( - phi::product(lr_dims), + common::product(lr_dims), 0, errors::InvalidArgument("Maybe the Input variable LearningRate has not " "been initialized. You may need to confirm " "if you put exe.run(startup_program) " "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ( - phi::product(lr_dims), + common::product(lr_dims), 1, errors::InvalidArgument("Learning_rate should be a scalar. 
But Received " "LearningRate's dim [%s]", - phi::product(lr_dims))); + common::product(lr_dims))); auto param_dim = param.dims(); param_out->set_dims(param_dim); @@ -3163,7 +3163,7 @@ void MultiDotInferMeta(const std::vector& x, // If the first tensor is 1D of size n view it as a row vector (1, n) if (first_dim.size() == 1) { - first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); + first_dim = common::make_ddim({1, static_cast(first_dim[0])}); is_vector = true; } @@ -3177,11 +3177,12 @@ void MultiDotInferMeta(const std::vector& x, // If the last tensor is 1D of size n view it as a column vector (n, 1) if (last_dim.size() == 1) { - last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); - out_dim = is_vector ? phi::make_ddim({}) : phi::make_ddim({first_dim[0]}); + last_dim = common::make_ddim({static_cast(last_dim[0]), 1}); + out_dim = + is_vector ? common::make_ddim({}) : common::make_ddim({first_dim[0]}); } else { - out_dim = is_vector ? phi::make_ddim({last_dim[1]}) - : phi::make_ddim({first_dim[0], last_dim[1]}); + out_dim = is_vector ? common::make_ddim({last_dim[1]}) + : common::make_ddim({first_dim[0], last_dim[1]}); } auto width = first_dim.at(1); @@ -3351,7 +3352,7 @@ void RmsNormInferMeta(const MetaTensor& x, const float quant_min_bound, MetaTensor* out, MetaTensor* residual_out) { - std::vector x_dims_vec = phi::vectorize(x.dims()); + std::vector x_dims_vec = common::vectorize(x.dims()); auto x_dims_size = x_dims_vec.size(); size_t normalized_dims = 1; @@ -3369,7 +3370,7 @@ void RmsNormInferMeta(const MetaTensor& x, normalized_dims, norm_weight.dims()[0])); - auto out_dims = phi::make_ddim(x_dims_vec); + auto out_dims = common::make_ddim(x_dims_vec); out->set_dims(out_dims); if (quant_scale <= 0.0f) { @@ -3429,12 +3430,12 @@ void RmspropInferMeta(const MetaTensor& param, mean_square.dims())); auto lr_dim = learning_rate.dims(); - PADDLE_ENFORCE_EQ(phi::product(lr_dim), + PADDLE_ENFORCE_EQ(common::product(lr_dim), 1, phi::errors::InvalidArgument( "Learning Rate of RmspropOp should be a scalar. But " "received LearningRate's dim [%s]", - phi::product(lr_dim))); + common::product(lr_dim))); if (master_param.initialized()) { PADDLE_ENFORCE_EQ(param_dim, @@ -3564,12 +3565,12 @@ void SgdInferMeta(const MetaTensor& param, "Output(ParamOut) of SGDOp should not be null.")); auto lr_dims = learning_rate.dims(); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), + PADDLE_ENFORCE_EQ(common::product(lr_dims), 1, phi::errors::InvalidArgument( "Learning rate should have 1 element. But received " "LearningRate dims [%s]", - phi::product(lr_dims))); + common::product(lr_dims))); param_out->set_dims(param.dims()); param_out->set_dtype(param.dtype()); @@ -3605,14 +3606,14 @@ void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, bool check = true; if ((!config.is_runtime) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { + (common::product(x_dims) <= 0 || common::product(labels_dims) <= 0)) { check = false; } if (check) { PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(labels_dims, 0, rank), + common::slice_ddim(x_dims, 0, rank), + common::slice_ddim(labels_dims, 0, rank), phi::errors::InvalidArgument( "Input(X) and Input(Label) shall have the same shape " "except the last dimension. 
But received: the shape of " @@ -3623,8 +3624,8 @@ void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, if (pos_weight) { auto weight_dims = pos_weight.dims(); PADDLE_ENFORCE_EQ( - phi::slice_ddim(weight_dims, 0, rank), - phi::slice_ddim(labels_dims, 0, rank), + common::slice_ddim(weight_dims, 0, rank), + common::slice_ddim(labels_dims, 0, rank), phi::errors::InvalidArgument( "Input(pos_weight) and Input(Label) shall have the same shape " "But received: the shape of Input(PosWeight) is [%s], " @@ -3705,8 +3706,8 @@ void SendUERecvInferMeta(const MetaTensor& x, // Infer out's shape according to x and e(need broadcasting condition) out->set_dtype(x.dtype()); - auto x_dims1 = phi::vectorize(x_dims); - auto y_dims1 = phi::vectorize(y_dims); + auto x_dims1 = common::vectorize(x_dims); + auto y_dims1 = common::vectorize(y_dims); std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); std::vector y_dims2(y_dims1.begin() + 1, y_dims1.end()); @@ -3716,15 +3717,15 @@ void SendUERecvInferMeta(const MetaTensor& x, std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); // Only need to broadcast dimensions other than the 0th dimension. - phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), - phi::make_ddim(y_dims2), + phi::funcs::GetBroadcastDimsArrays(common::make_ddim(x_dims2), + common::make_ddim(y_dims2), x_dims_array.data(), y_dims_array.data(), out_dims_array.data(), max_dim, axis); out_dims_array.insert(out_dims_array.begin(), -1); - out->set_dims(phi::make_ddim(out_dims_array)); + out->set_dims(common::make_ddim(out_dims_array)); } void SendUVInferMeta(const MetaTensor& x, @@ -3776,8 +3777,8 @@ void SendUVInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); auto x_dims = x.dims(); auto y_dims = y.dims(); - auto x_dims1 = phi::vectorize(x_dims); - auto y_dims1 = phi::vectorize(y_dims); + auto x_dims1 = common::vectorize(x_dims); + auto y_dims1 = common::vectorize(y_dims); std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); std::vector y_dims2(y_dims1.begin() + 1, y_dims1.end()); int max_dim = static_cast(std::max(x_dims2.size(), y_dims2.size())); @@ -3786,15 +3787,15 @@ void SendUVInferMeta(const MetaTensor& x, std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); // Only need to broadcast dimensions other than the 0th dimension. 
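// Sketch of the tail-dimension broadcast idiom used by SendUERecvInferMeta
// and SendUVInferMeta around this point, spelled out with explicit template
// arguments. phi::funcs::GetBroadcastDimsArrays keeps its phi home; only the
// DDim wrappers move to common::. Assumptions: axis = -1 stands in for the
// axis computed upstream in the real functions, and the element types
// (int64_t shapes, int broadcast arrays) follow the usual phi conventions.
void TailBroadcastSketch(const phi::DDim& x_dims,
                         const phi::DDim& y_dims,
                         phi::MetaTensor* out) {
  auto x_dims1 = common::vectorize(x_dims);  // full shape as vector<int64_t>
  auto y_dims1 = common::vectorize(y_dims);
  // Drop the 0th (node/index) dimension; only the tail dims broadcast.
  std::vector<int64_t> x_dims2(x_dims1.begin() + 1, x_dims1.end());
  std::vector<int64_t> y_dims2(y_dims1.begin() + 1, y_dims1.end());
  int max_dim = static_cast<int>(std::max(x_dims2.size(), y_dims2.size()));
  const int axis = -1;  // assumption: the real callers pass a computed axis
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  phi::funcs::GetBroadcastDimsArrays(common::make_ddim(x_dims2),
                                     common::make_ddim(y_dims2),
                                     x_dims_array.data(),
                                     y_dims_array.data(),
                                     out_dims_array.data(),
                                     max_dim,
                                     axis);
  // Re-prepend a leading dim that is only known at runtime.
  out_dims_array.insert(out_dims_array.begin(), -1);
  out->set_dims(common::make_ddim(out_dims_array));
}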
- phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), - phi::make_ddim(y_dims2), + phi::funcs::GetBroadcastDimsArrays(common::make_ddim(x_dims2), + common::make_ddim(y_dims2), x_dims_array.data(), y_dims_array.data(), out_dims_array.data(), max_dim, axis); out_dims_array.insert(out_dims_array.begin(), src_index_dims[0]); // NOLINT - out->set_dims(phi::make_ddim(out_dims_array)); + out->set_dims(common::make_ddim(out_dims_array)); } void StackInferMeta(const std::vector& x, @@ -3830,9 +3831,9 @@ void StackInferMeta(const std::vector& x, rank, axis)); if (axis < 0) axis += (rank + 1); - auto vec = phi::vectorize(out_dim); + auto vec = common::vectorize(out_dim); vec.insert(vec.begin() + axis, input_dims.size()); // NOLINT - out->set_dims(phi::make_ddim(vec)); + out->set_dims(common::make_ddim(vec)); out->set_dtype(x.at(0)->dtype()); out->share_lod(*x.at(0)); } @@ -3923,7 +3924,7 @@ void WarpctcInferMeta(const MetaTensor& logits, sequence_width = static_cast(logits_dims[2]); } else { sequence_width = - static_cast(phi::product(logits_dims) / logits_dims[0]); + static_cast(common::product(logits_dims) / logits_dims[0]); } PADDLE_ENFORCE_GE( @@ -4182,15 +4183,15 @@ void YoloLossInferMeta(const MetaTensor& x, } std::vector dim_out({dim_x[0]}); - loss->set_dims(phi::make_ddim(dim_out)); + loss->set_dims(common::make_ddim(dim_out)); loss->set_dtype(x.dtype()); std::vector dim_obj_mask({dim_x[0], mask_num, dim_x[2], dim_x[3]}); - objectness_mask->set_dims(phi::make_ddim(dim_obj_mask)); + objectness_mask->set_dims(common::make_ddim(dim_obj_mask)); objectness_mask->set_dtype(x.dtype()); std::vector dim_gt_match_mask({dim_gtbox[0], dim_gtbox[1]}); - gt_match_mask->set_dims(phi::make_ddim(dim_gt_match_mask)); + gt_match_mask->set_dims(common::make_ddim(dim_gt_match_mask)); gt_match_mask->set_dtype(x.dtype()); } @@ -4546,7 +4547,7 @@ void MaskedMultiheadAttentionInferMeta(const MetaTensor& x, void FullWithTensorInferMeta(const MetaTensor& shape, DataType dtype, MetaTensor* out) { - out->set_dims(make_ddim(std::vector(shape.numel(), -1))); + out->set_dims(common::make_ddim(std::vector(shape.numel(), -1))); out->set_dtype(dtype); } diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index d32f160fd7f927..3f8686753e890b 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -27,7 +27,7 @@ void ArangeInferMeta(const Scalar& start, double step_value = step.to(); int numel = static_cast(std::ceil((end_value - start_value) / step_value)); - out->set_dims(phi::make_ddim(std::vector(1, numel))); + out->set_dims(common::make_ddim(std::vector(1, numel))); } else { out->set_dims({-1}); } @@ -37,7 +37,7 @@ void ArangeInferMeta(const Scalar& start, void AssignValueInferMeta(const std::vector& shape, DataType dtype, MetaTensor* out) { - out->set_dims(phi::make_ddim(shape)); + out->set_dims(common::make_ddim(shape)); out->set_dtype(dtype); } @@ -57,7 +57,7 @@ void CreateInferMeta(const IntArray& shape, DataType dtype, MetaTensor* out) { "than 0. 
But received: shape[%u] = %d; shape = [%s].", i, data[i], - phi::make_ddim(data))); + common::make_ddim(data))); } } CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out); @@ -74,7 +74,7 @@ void CreateInferMetaBase(const std::vector& shape, DataType dtype, DataLayout layout, MetaTensor* out) { - auto out_dims = phi::make_ddim(shape); + auto out_dims = common::make_ddim(shape); out->set_dims(out_dims); out->set_dtype(dtype); out->set_layout(layout); @@ -84,7 +84,7 @@ void DataInferMeta(const std::string& name, const phi::IntArray& shape, phi::DataType data_type, MetaTensor* out) { - auto out_dims = phi::make_ddim(shape.GetData()); + auto out_dims = common::make_ddim(shape.GetData()); out->set_dims(out_dims); out->set_dtype(data_type); } @@ -117,21 +117,21 @@ void GaussianInferMeta(const IntArray& shape, int seed, DataType dtype, MetaTensor* out) { - auto out_dims = phi::make_ddim(shape.GetData()); + auto out_dims = common::make_ddim(shape.GetData()); out->set_dims(out_dims); out->set_dtype(dtype); out->set_layout(DataLayout::NCHW); } void RandpermInferMeta(int n, DataType dtype, MetaTensor* out) { - out->set_dims(phi::make_ddim({n})); + out->set_dims(common::make_ddim({n})); out->set_dtype(dtype); } void UniformRandomInferMeta(const IntArray& shape, DataType dtype, MetaTensor* out) { - auto out_dims = phi::make_ddim(shape.GetData()); + auto out_dims = common::make_ddim(shape.GetData()); out->set_dims(out_dims); out->set_dtype(dtype); out->set_layout(DataLayout::NCHW); @@ -156,7 +156,7 @@ void RandintInferMeta( for (auto dim : shape_vector) { tensor_shape.push_back(static_cast(dim)); } - out->set_dims(make_ddim(tensor_shape)); + out->set_dims(common::make_ddim(tensor_shape)); out->set_dtype(dtype); } @@ -238,13 +238,13 @@ void RecvV2InferMeta(const int ring_id, i, out_shape[i])); } - out->set_dims(phi::make_ddim(out_shape)); + out->set_dims(common::make_ddim(out_shape)); } out->set_dtype(dtype); } void SeedInferMeta(int seed, MetaTensor* out) { - out->set_dims(phi::make_ddim({1})); + out->set_dims(common::make_ddim({1})); out->set_dtype(DataType::INT32); } @@ -254,7 +254,7 @@ void TruncatedGaussianRandomInferMeta(const std::vector& shape, int seed, DataType dtype, MetaTensor* out) { - auto out_dims = phi::make_ddim(shape); + auto out_dims = common::make_ddim(shape); out->set_dims(out_dims); out->set_dtype(dtype); out->set_layout(DataLayout::NCHW); @@ -279,7 +279,7 @@ void TrilIndicesInferMeta( tril_size += diff_row * cols; } std::vector tmp = {2, tril_size}; - auto out_dims = phi::make_ddim(tmp); + auto out_dims = common::make_ddim(tmp); out->set_dims(out_dims); out->set_dtype(dtype); } @@ -308,7 +308,7 @@ void TriuIndicesInferMeta( tril_size += diff_row * col; } std::vector tmp = {2, row * col - tril_size}; - auto out_dims = phi::make_ddim(tmp); + auto out_dims = common::make_ddim(tmp); out->set_dims(out_dims); out->set_dtype(dtype); } diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index c1c0501593ea45..1447649eb1ebb7 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/meta_tensor.h" namespace phi { diff --git a/paddle/phi/infermeta/sparse/binary.cc b/paddle/phi/infermeta/sparse/binary.cc index 6e9a5775e24634..2ed540c0e0c4db 100644 --- a/paddle/phi/infermeta/sparse/binary.cc +++ b/paddle/phi/infermeta/sparse/binary.cc @@ -91,7 +91,7 @@ void Conv3dInferMeta(const MetaTensor& x, int rank = is2D ? 4 : 5; std::vector out_dims_vec(rank, 1); - DDim out_dims = make_ddim(out_dims_vec); + DDim out_dims = common::make_ddim(out_dims_vec); std::vector kernel_sizes(kernel_dims.size()); for (int i = 0; i < kernel_dims.size(); i++) { @@ -164,7 +164,7 @@ void SparseCooTensorInferMeta(const MetaTensor& values, const MetaTensor& indices, const std::vector& shape, MetaTensor* out) { - out->set_dims(phi::make_ddim(shape)); + out->set_dims(common::make_ddim(shape)); out->set_dtype(values.dtype()); out->set_layout(values.layout()); } diff --git a/paddle/phi/infermeta/spmd_rules/concat.cc b/paddle/phi/infermeta/spmd_rules/concat.cc index 5311b287a734ed..15844a426245fe 100644 --- a/paddle/phi/infermeta/spmd_rules/concat.cc +++ b/paddle/phi/infermeta/spmd_rules/concat.cc @@ -62,7 +62,7 @@ SpmdInfo ConcatInferSpmd(const std::vector& x, int axis) { x.end(), std::back_inserter(tensor_shapes), [](const DistMetaTensor& meta) { - return phi::vectorize(meta.dims()); + return common::vectorize(meta.dims()); }); bool all_empty = std::all_of(tensor_shapes.begin(), tensor_shapes.end(), IsEmpty); @@ -125,7 +125,7 @@ SpmdInfo ConcatGradInferSpmdDynamic(const std::vector& x, x.end(), std::back_inserter(tensor_shapes), [](const DistMetaTensor& meta) { - return phi::vectorize(meta.dims()); + return common::vectorize(meta.dims()); }); bool all_empty = std::all_of(tensor_shapes.begin(), tensor_shapes.end(), IsEmpty); @@ -148,7 +148,7 @@ SpmdInfo ConcatGradInferSpmdDynamic(const std::vector& x, return meta.dist_attr(); }); input_attrs.push_back(output_grad.dist_attr()); - tensor_shapes.push_back(phi::vectorize(output_grad.dims())); + tensor_shapes.push_back(common::vectorize(output_grad.dims())); std::string all_aixs; std::string align_axis; std::tie(all_aixs, align_axis) = FillConcatNotation(ndim, dim); diff --git a/paddle/phi/infermeta/spmd_rules/default_data_parallel.cc b/paddle/phi/infermeta/spmd_rules/default_data_parallel.cc index 7a3639147f1ee6..6b390e7eda1a8d 100644 --- a/paddle/phi/infermeta/spmd_rules/default_data_parallel.cc +++ b/paddle/phi/infermeta/spmd_rules/default_data_parallel.cc @@ -81,7 +81,7 @@ SpmdInfo DefaultDataParallelInferSpmd( VLOG(4) << "DefaultDataParallelSpmd InferForward:"; for (int64_t i = 0; i < ninputs; i++) { VLOG(4) << "Input" << std::to_string(i) << " shape: [" - << str_join(phi::vectorize(ins[i]->dims())) << "] " + << str_join(common::vectorize(ins[i]->dims())) << "] " << "src_dims_mapping: [" << str_join(ins[i]->dist_attr().dims_mapping()) << "] " << "dst_dims_mapping: [" @@ -90,7 +90,7 @@ SpmdInfo DefaultDataParallelInferSpmd( for (int64_t i = 0; i < noutputs; i++) { VLOG(4) << "Output" << std::to_string(i) << " shape: [" - << str_join(phi::vectorize(outs[i]->dims())) << "] " + << str_join(common::vectorize(outs[i]->dims())) << "] " << "dst_dims_mapping: [" << str_join(output_dist_attrs[i].dims_mapping()) << "]"; } @@ -144,7 +144,7 @@ SpmdInfo DefaultDataParallelInferSpmdReverse( VLOG(4) << "DefaultDataParallelSpmd InferBackward:"; for (int64_t i = 0; i < noutputs; i++) { VLOG(4) << "Output" << std::to_string(i) << 
" shape: [" - << str_join(phi::vectorize(outs[i]->dims())) << "] " + << str_join(common::vectorize(outs[i]->dims())) << "] " << "src_dims_mapping: [" << str_join(outs[i]->dist_attr().dims_mapping()) << "] " << "dst_dims_mapping: [" @@ -153,7 +153,7 @@ SpmdInfo DefaultDataParallelInferSpmdReverse( for (int64_t i = 0; i < ninputs; i++) { VLOG(4) << "Input" << std::to_string(i) << " shape: [" - << str_join(phi::vectorize(ins[i]->dims())) << "] " + << str_join(common::vectorize(ins[i]->dims())) << "] " << "dst_dims_mapping: [" << str_join(dst_input_dist_attrs[i].dims_mapping()) << "]"; } diff --git a/paddle/phi/infermeta/spmd_rules/elementwise.cc b/paddle/phi/infermeta/spmd_rules/elementwise.cc index e0d4c248b7760c..a4b6826f35cbf9 100644 --- a/paddle/phi/infermeta/spmd_rules/elementwise.cc +++ b/paddle/phi/infermeta/spmd_rules/elementwise.cc @@ -81,7 +81,7 @@ void GetBinaryNotations(const std::vector& x_shape, SpmdInfo ElementwiseUnaryInferSpmd(const DistMetaTensor& x) { // Step0: Verify Input Args Based on Elementwise Logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); TensorDistAttr x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -127,9 +127,9 @@ SpmdInfo ElementwiseUnaryInferSpmd(const DistMetaTensor& x) { SpmdInfo ElementwiseUnaryInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out) { // Step0: Verify Input Args Based on Elementwise Logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); - auto out_shape = phi::vectorize(out.dims()); + auto out_shape = common::vectorize(out.dims()); int out_ndim = out_shape.size(); TensorDistAttr out_dist_attr = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr.dims_mapping(); @@ -181,9 +181,9 @@ SpmdInfo ElementwiseUnaryInferSpmdReverse(const DistMetaTensor& x, SpmdInfo ElementwiseBinaryInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y) { // Step0: Verify Input Args Based on Elementwise Logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); - auto y_shape = phi::vectorize(y.dims()); + auto y_shape = common::vectorize(y.dims()); int y_ndim = y_shape.size(); TensorDistAttr x_dist_attr_src = x.dist_attr(); TensorDistAttr y_dist_attr_src = y.dist_attr(); @@ -251,11 +251,11 @@ SpmdInfo ElementwiseBinaryInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& y, const DistMetaTensor& out) { // Step0: Verify Input Args Based on Elementwise Logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); - auto y_shape = phi::vectorize(y.dims()); + auto y_shape = common::vectorize(y.dims()); int y_ndim = y_shape.size(); - auto out_shape = phi::vectorize(out.dims()); + auto out_shape = common::vectorize(out.dims()); int out_ndim = out_shape.size(); int max_ndim = std::max(x_ndim, y_ndim); TensorDistAttr out_dist_attr = out.dist_attr(); diff --git a/paddle/phi/infermeta/spmd_rules/embedding.cc b/paddle/phi/infermeta/spmd_rules/embedding.cc index 99aec54cd54742..873f8065f222a1 100644 --- a/paddle/phi/infermeta/spmd_rules/embedding.cc +++ b/paddle/phi/infermeta/spmd_rules/embedding.cc @@ -33,8 +33,8 @@ SpmdInfo EmbeddingInferSpmd(const DistMetaTensor& x, int padding_idx, bool sparse) { // Step0: Verify input args based on embedding logic - auto x_shape = phi::vectorize(x.dims()); - auto weight_shape = 
phi::vectorize(weight.dims()); + auto x_shape = common::vectorize(x.dims()); + auto weight_shape = common::vectorize(weight.dims()); int x_ndim = static_cast(x_shape.size()); int weight_ndim = static_cast(weight_shape.size()); auto x_dist_attr_src = x.dist_attr(); @@ -159,9 +159,9 @@ SpmdInfo EmbeddingInferSpmdReverse(const DistMetaTensor& x, bool sparse) { // Step0: Verify input args based on embedding logic // InferBackward is called after InferForward, so we skip some checks. - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = static_cast(x_shape.size()); - auto out_shape = phi::vectorize(out.dims()); + auto out_shape = common::vectorize(out.dims()); int out_ndim = static_cast(out_shape.size()); PADDLE_ENFORCE_EQ(x_ndim, diff --git a/paddle/phi/infermeta/spmd_rules/flash_attention.cc b/paddle/phi/infermeta/spmd_rules/flash_attention.cc index c12f6665237721..5f6e05eb256eeb 100644 --- a/paddle/phi/infermeta/spmd_rules/flash_attention.cc +++ b/paddle/phi/infermeta/spmd_rules/flash_attention.cc @@ -60,7 +60,7 @@ SpmdInfo FlashAttInferSpmd(const DistMetaTensor& q, const std::string& rng_name) { // q // [batch_size, seq_len_q, num_heads, head_dim] - auto q_shape = phi::vectorize(q.dims()); + auto q_shape = common::vectorize(q.dims()); int q_ndim = q_shape.size(); auto q_dist_attr = q.dist_attr(); int q_dims_mapping_size = q_dist_attr.dims_mapping().size(); @@ -85,7 +85,7 @@ SpmdInfo FlashAttInferSpmd(const DistMetaTensor& q, // k // [batch_size, seq_len_kv, num_heads, head_dim] - auto k_shape = phi::vectorize(k.dims()); + auto k_shape = common::vectorize(k.dims()); int k_ndim = k_shape.size(); auto k_dist_attr = k.dist_attr(); int k_dims_mapping_size = k_dist_attr.dims_mapping().size(); @@ -134,7 +134,7 @@ SpmdInfo FlashAttInferSpmd(const DistMetaTensor& q, // v // [batch_size, seq_len_kv, num_heads, head_dim] - auto v_shape = phi::vectorize(v.dims()); + auto v_shape = common::vectorize(v.dims()); int v_ndim = v_shape.size(); auto v_dist_attr = v.dist_attr(); int v_dims_mapping_size = v_dist_attr.dims_mapping().size(); @@ -183,9 +183,9 @@ SpmdInfo FlashAttInferSpmd(const DistMetaTensor& q, // fixed_seed_offset // TODO(liuzhenhai): process fixed_seed_offset and attn_mask auto fixed_seed_offset_dist_attr = fixed_seed_offset.dist_attr(); - auto fixed_seed_offset_shape = phi::vectorize(fixed_seed_offset.dims()); + auto fixed_seed_offset_shape = common::vectorize(fixed_seed_offset.dims()); // attn_mask - auto attn_mask_shape = phi::vectorize(attn_mask.dims()); + auto attn_mask_shape = common::vectorize(attn_mask.dims()); int mask_ndim = attn_mask_shape.size(); auto attn_mask_dist_attr = attn_mask.dist_attr(); int mask_dims_mapping_size = attn_mask_dist_attr.dims_mapping().size(); @@ -289,7 +289,7 @@ SpmdInfo FlashAttGradInferSpmd(const DistMetaTensor& q, bool causal) { // q // [batch_size, seq_len_q, num_heads, head_dim] - auto q_shape = phi::vectorize(q.dims()); + auto q_shape = common::vectorize(q.dims()); int q_ndim = q_shape.size(); auto q_dist_attr = q.dist_attr(); int q_dims_mapping_size = q_dist_attr.dims_mapping().size(); @@ -314,7 +314,7 @@ SpmdInfo FlashAttGradInferSpmd(const DistMetaTensor& q, // k // [batch_size, seq_len_kv, num_heads, head_dim] - auto k_shape = phi::vectorize(k.dims()); + auto k_shape = common::vectorize(k.dims()); int k_ndim = k_shape.size(); auto k_dist_attr = k.dist_attr(); int k_dims_mapping_size = k_dist_attr.dims_mapping().size(); @@ -363,7 +363,7 @@ SpmdInfo FlashAttGradInferSpmd(const DistMetaTensor& q, // v // 
[batch_size, seq_len_kv, num_heads, head_dim] - auto v_shape = phi::vectorize(v.dims()); + auto v_shape = common::vectorize(v.dims()); int v_ndim = v_shape.size(); auto v_dist_attr = v.dist_attr(); int v_dims_mapping_size = v_dist_attr.dims_mapping().size(); @@ -411,10 +411,10 @@ SpmdInfo FlashAttGradInferSpmd(const DistMetaTensor& q, // fixed_seed_offset auto seed_offset_dist_attr = seed_offset.dist_attr(); - auto seed_offset_shape = phi::vectorize(seed_offset.dims()); + auto seed_offset_shape = common::vectorize(seed_offset.dims()); // attn_mask - auto attn_mask_shape = phi::vectorize(attn_mask.dims()); + auto attn_mask_shape = common::vectorize(attn_mask.dims()); int mask_ndim = attn_mask_shape.size(); auto attn_mask_dist_attr = attn_mask.dist_attr(); int mask_dims_mapping_size = attn_mask_dist_attr.dims_mapping().size(); @@ -428,13 +428,13 @@ SpmdInfo FlashAttGradInferSpmd(const DistMetaTensor& q, mask_dims_mapping_size)); } - auto out_shape = phi::vectorize(out.dims()); + auto out_shape = common::vectorize(out.dims()); auto out_dist_attr = out.dist_attr(); - auto softmax_lse_shape = phi::vectorize(softmax_lse.dims()); + auto softmax_lse_shape = common::vectorize(softmax_lse.dims()); auto softmax_lse_dist_attr = softmax_lse.dist_attr(); - auto out_grad_shape = phi::vectorize(out_grad.dims()); + auto out_grad_shape = common::vectorize(out_grad.dims()); auto out_grad_dist_attr = out_grad.dist_attr(); std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; diff --git a/paddle/phi/infermeta/spmd_rules/flatten.cc b/paddle/phi/infermeta/spmd_rules/flatten.cc index ec0917b840785d..bd1b88cf07b06c 100644 --- a/paddle/phi/infermeta/spmd_rules/flatten.cc +++ b/paddle/phi/infermeta/spmd_rules/flatten.cc @@ -93,7 +93,7 @@ SpmdInfo FlattenInferSpmd(const DistMetaTensor& x, int start_axis, int stop_axis) { // Step0: Verify input args based on flatten logic - auto src_shape = phi::vectorize(x.dims()); + auto src_shape = common::vectorize(x.dims()); int x_ndim = static_cast(src_shape.size()); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -145,9 +145,9 @@ SpmdInfo FlattenInferSpmdReverse(const DistMetaTensor& x, int start_axis, int stop_axis) { // Step0: Verify input args based on flatten logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); auto x_ndim = x_shape.size(); - auto out_shape = phi::vectorize(out.dims()); + auto out_shape = common::vectorize(out.dims()); int out_ndim = out_shape.size(); auto out_dist_attr_src = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); diff --git a/paddle/phi/infermeta/spmd_rules/layer_norm.cc b/paddle/phi/infermeta/spmd_rules/layer_norm.cc index 9faf0f240d3d20..7bd9482f4aa615 100644 --- a/paddle/phi/infermeta/spmd_rules/layer_norm.cc +++ b/paddle/phi/infermeta/spmd_rules/layer_norm.cc @@ -32,9 +32,9 @@ SpmdInfo LayerNormInferSpmd(const DistMetaTensor& x, float epsilon, int begin_norm_axis) { // Step0: verify input args based on layer_norm logic - auto x_shape = phi::vectorize(x.dims()); - auto scale_shape = phi::vectorize(scale.dims()); - auto bias_shape = phi::vectorize(bias.dims()); + auto x_shape = common::vectorize(x.dims()); + auto scale_shape = common::vectorize(scale.dims()); + auto bias_shape = common::vectorize(bias.dims()); int x_ndim = x_shape.size(); int scale_ndim = scale_shape.size(); int bias_ndim = bias_shape.size(); @@ -158,10 +158,10 @@ SpmdInfo LayerNormInferSpmdReverse(const DistMetaTensor& x, float epsilon, int 
begin_norm_axis) { // Step0: Verify input args based on layer_norm logic - auto x_shape = phi::vectorize(x.dims()); - auto out_shape = phi::vectorize(out.dims()); - auto mean_shape = phi::vectorize(mean.dims()); - auto variance_shape = phi::vectorize(variance.dims()); + auto x_shape = common::vectorize(x.dims()); + auto out_shape = common::vectorize(out.dims()); + auto mean_shape = common::vectorize(mean.dims()); + auto variance_shape = common::vectorize(variance.dims()); int x_ndim = x_shape.size(); int out_ndim = out_shape.size(); int mean_ndim = mean_shape.size(); @@ -303,7 +303,7 @@ SpmdInfo LayerNormGradInferSpmd(const DistMetaTensor& x, float epsilon, int begin_norm_axis) { auto get_shape = [](const auto& meta) { - return phi::vectorize(meta.dims()); + return common::vectorize(meta.dims()); }; // 1、check tensors shapes auto x_shape = get_shape(x); diff --git a/paddle/phi/infermeta/spmd_rules/matmul.cc b/paddle/phi/infermeta/spmd_rules/matmul.cc index 277073fb798b16..5cd895401dc96e 100644 --- a/paddle/phi/infermeta/spmd_rules/matmul.cc +++ b/paddle/phi/infermeta/spmd_rules/matmul.cc @@ -119,8 +119,8 @@ SpmdInfo MatmulInferSpmd(const DistMetaTensor& x, bool trans_x, bool trans_y) { // Step0: verify input args based on matmul logic - auto x_shape = phi::vectorize(x.dims()); - auto y_shape = phi::vectorize(y.dims()); + auto x_shape = common::vectorize(x.dims()); + auto y_shape = common::vectorize(y.dims()); int x_ndim = x_shape.size(); int y_ndim = y_shape.size(); auto x_dist_attr_src = x.dist_attr(); @@ -226,11 +226,11 @@ SpmdInfo MatmulInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, bool trans_x, bool trans_y) { - auto out_shape = phi::vectorize(out.dims()); + auto out_shape = common::vectorize(out.dims()); int out_ndim = out_shape.size(); - auto x_shape = phi::vectorize(x.dims()); - auto y_shape = phi::vectorize(y.dims()); + auto x_shape = common::vectorize(x.dims()); + auto y_shape = common::vectorize(y.dims()); int x_ndim = x_shape.size(); int y_ndim = y_shape.size(); int max_ndim = std::max(x_ndim, y_ndim); diff --git a/paddle/phi/infermeta/spmd_rules/numel.cc b/paddle/phi/infermeta/spmd_rules/numel.cc index 013639cbb0df2b..ca0678b7731635 100644 --- a/paddle/phi/infermeta/spmd_rules/numel.cc +++ b/paddle/phi/infermeta/spmd_rules/numel.cc @@ -25,7 +25,7 @@ using phi::distributed::auto_parallel::str_join; SpmdInfo NumelInferSpmd(const DistMetaTensor& x) { std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); diff --git a/paddle/phi/infermeta/spmd_rules/reduction.cc b/paddle/phi/infermeta/spmd_rules/reduction.cc index 3935459683e551..8c8721a238a984 100644 --- a/paddle/phi/infermeta/spmd_rules/reduction.cc +++ b/paddle/phi/infermeta/spmd_rules/reduction.cc @@ -70,7 +70,7 @@ SpmdInfo ReductionInferSpmdBase(const DistMetaTensor& x, bool keep_dim, int reduce_type) { // Step0: Verify input args based on reduction logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -170,8 +170,8 @@ SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x, const std::vector& axis, bool keep_dim) { // Step0: Verify input args based on reduction logic - auto x_shape = phi::vectorize(x.dims()); - auto 
out_shape = phi::vectorize(out.dims()); + auto x_shape = common::vectorize(x.dims()); + auto out_shape = common::vectorize(out.dims()); int x_ndim = x_shape.size(); int out_ndim = out_shape.size(); auto out_dist_attr_src = out.dist_attr(); @@ -228,8 +228,8 @@ SpmdInfo ReductionGradInferSpmd(const DistMetaTensor& x, TensorDistAttr x_dist_attr = out_grad_dist_attr; TensorDistAttr x_grad_dist_attr = out_grad_dist_attr; - std::vector x_dim = phi::vectorize(x.dims()); - std::vector out_grad_dim = phi::vectorize(out_grad.dims()); + std::vector x_dim = common::vectorize(x.dims()); + std::vector out_grad_dim = common::vectorize(out_grad.dims()); if (x_dim.size() != out_grad_dim.size()) { auto dims_mapping = x_dist_attr.dims_mapping(); diff --git a/paddle/phi/infermeta/spmd_rules/replicated.cc b/paddle/phi/infermeta/spmd_rules/replicated.cc index d0c90f7b2d2a96..a6759a8ea3aa7e 100644 --- a/paddle/phi/infermeta/spmd_rules/replicated.cc +++ b/paddle/phi/infermeta/spmd_rules/replicated.cc @@ -72,7 +72,7 @@ SpmdInfo ReplicatedInferSpmd(const std::vector& ins, continue; } VLOG(4) << "Input" << std::to_string(i) << " shape: [" - << str_join(phi::vectorize(ins[i]->dims())) << "] " + << str_join(common::vectorize(ins[i]->dims())) << "] " << "src_dims_mapping: [" << str_join(ins[i]->dist_attr().dims_mapping()) << "] " << "dst_dims_mapping: [" @@ -81,7 +81,7 @@ SpmdInfo ReplicatedInferSpmd(const std::vector& ins, for (int64_t i = 0; i < noutputs; i++) { VLOG(4) << "Output" << std::to_string(i) << " shape: [" - << str_join(phi::vectorize(outs[i]->dims())) << "] " + << str_join(common::vectorize(outs[i]->dims())) << "] " << "dst_dims_mapping: [" << str_join(output_dist_attrs[i].dims_mapping()) << "]"; } @@ -122,7 +122,7 @@ SpmdInfo ReplicatedInferSpmdReverse( VLOG(4) << "ReplicatedSpmd InferBackward:"; for (int64_t i = 0; i < noutputs; i++) { VLOG(4) << "Output" << std::to_string(i) << " shape: [" - << str_join(phi::vectorize(outs[i]->dims())) << "] " + << str_join(common::vectorize(outs[i]->dims())) << "] " << "src_dims_mapping: [" << str_join(outs[i]->dist_attr().dims_mapping()) << "] " << "dst_dims_mapping: [" @@ -131,7 +131,7 @@ SpmdInfo ReplicatedInferSpmdReverse( for (int64_t i = 0; i < ninputs; i++) { VLOG(4) << "Input" << std::to_string(i) << " shape: [" - << str_join(phi::vectorize(ins[i]->dims())) << "] " + << str_join(common::vectorize(ins[i]->dims())) << "] " << "dst_dims_mapping: [" << str_join(dst_input_dist_attrs[i].dims_mapping()) << "]"; } diff --git a/paddle/phi/infermeta/spmd_rules/reshape.cc b/paddle/phi/infermeta/spmd_rules/reshape.cc index c1364dc97d876e..5f4f59f9995dd8 100644 --- a/paddle/phi/infermeta/spmd_rules/reshape.cc +++ b/paddle/phi/infermeta/spmd_rules/reshape.cc @@ -225,8 +225,8 @@ SpmdInfo ReshapeInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, const std::vector& shape) { // Step0: Verify input args based on reshape logic - auto x_shape = phi::vectorize(x.dims()); - auto out_shape = phi::vectorize(out.dims()); + auto x_shape = common::vectorize(x.dims()); + auto out_shape = common::vectorize(out.dims()); int x_ndim = x_shape.size(); int out_ndim = out_shape.size(); auto out_dist_attr_src = out.dist_attr(); @@ -317,7 +317,7 @@ SpmdInfo ReshapeInferSpmdDynamic(const DistMetaTensor& x, SpmdInfo ReshapeGradInferSpmd(const DistMetaTensor& x_shape, const DistMetaTensor& out_grad) { - std::vector out_grad_shape = phi::vectorize(out_grad.dims()); + std::vector out_grad_shape = common::vectorize(out_grad.dims()); const auto& x_shape_dist_src = x_shape.dist_attr(); 
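// Every spmd_rules hunk in this patch makes the same substitution: the dense
// shape of a DistMetaTensor is flattened with common::vectorize (formerly
// phi::vectorize) before the rule builds its einsum-like notation from
// dims_mapping. A minimal sketch of that access pattern; the helper name
// ShapeAndRank is illustrative, while DistMetaTensor, dims(), dist_attr(),
// and dims_mapping() all appear in the surrounding hunks. (Assumed headers:
// <cstdint>, <utility>, <vector>.)
std::pair<std::vector<int64_t>, int> ShapeAndRank(
    const phi::distributed::DistMetaTensor& t) {
  std::vector<int64_t> shape = common::vectorize(t.dims());  // DDim -> vector
  int ndim = static_cast<int>(shape.size());  // rank drives the notation size
  return {shape, ndim};
}
// Usage inside a rule, mirroring e.g. the reduction hunks above:
//   auto [x_shape, x_ndim] = ShapeAndRank(x);
//   std::vector<int64_t> x_dims_mapping = x.dist_attr().dims_mapping();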
auto tmp = ReshapeInferSpmdDynamic(x_shape, out_grad_shape); // check no shard is needed diff --git a/paddle/phi/infermeta/spmd_rules/slice.cc b/paddle/phi/infermeta/spmd_rules/slice.cc index 73caa2e65aa45b..54e0233ac8e10f 100644 --- a/paddle/phi/infermeta/spmd_rules/slice.cc +++ b/paddle/phi/infermeta/spmd_rules/slice.cc @@ -29,7 +29,7 @@ using phi::distributed::auto_parallel::str_join; SpmdInfo SliceInferSpmdBase(const DistMetaTensor& input, const std::vector& axes) { // Step0: Verify input args based on slice logic - auto input_shape = phi::vectorize(input.dims()); + auto input_shape = common::vectorize(input.dims()); int input_ndim = input_shape.size(); auto input_dist_attr_src = input.dist_attr(); std::vector input_dims_mapping = input_dist_attr_src.dims_mapping(); @@ -109,12 +109,11 @@ SpmdInfo SliceInferSpmd(const DistMetaTensor& input, SpmdInfo SliceInferSpmdReverseBase(const DistMetaTensor& input, const DistMetaTensor& output, const std::vector& axes) { - // Step0: Verify input args based on slice logic - auto output_shape = phi::vectorize(output.dims()); + auto output_shape = common::vectorize(output.dims()); int out_ndim = output_shape.size(); auto out_dist_attr = output.dist_attr(); int out_dims_mapping_size = out_dist_attr.dims_mapping().size(); - auto input_shape = phi::vectorize(input.dims()); + auto input_shape = common::vectorize(input.dims()); int input_ndim = input_shape.size(); auto input_dist_attr = input.dist_attr(); std::vector input_dims_mapping = input_dist_attr.dims_mapping(); @@ -177,7 +176,7 @@ SpmdInfo SliceInferSpmdReverseBase(const DistMetaTensor& input, VLOG(4) << "SliceInferSpmdReverse:"; VLOG(4) << "Einsum Notation: " << input_axes << "-->" << out_axes; VLOG(4) << "Output" - << " shape: [" << str_join(phi::vectorize(output.dims())) << "] " + << " shape: [" << str_join(common::vectorize(output.dims())) << "] " << "axes: [" << str_join(axes) << "] " << "src_dims_mapping: [" << str_join(output.dist_attr().dims_mapping()) << "] " @@ -223,10 +222,10 @@ SpmdInfo SliceGradInferBase(const DistMetaTensor& input, auto out_dist_attr = out_grad.dist_attr(); input_dist_attr = UnShardTensorDims(input_dist_attr, axes); out_dist_attr = UnShardTensorDims(out_dist_attr, axes); - auto output_shape = phi::vectorize(out_grad.dims()); + auto output_shape = common::vectorize(out_grad.dims()); int out_ndim = output_shape.size(); int out_dims_mapping_size = out_dist_attr.dims_mapping().size(); - auto input_shape = phi::vectorize(input.dims()); + auto input_shape = common::vectorize(input.dims()); int input_ndim = input_shape.size(); std::vector input_dims_mapping = input_dist_attr.dims_mapping(); diff --git a/paddle/phi/infermeta/spmd_rules/softmax.cc b/paddle/phi/infermeta/spmd_rules/softmax.cc index 35f811ab99d2b7..1b3d9c5e56a946 100644 --- a/paddle/phi/infermeta/spmd_rules/softmax.cc +++ b/paddle/phi/infermeta/spmd_rules/softmax.cc @@ -30,7 +30,7 @@ using phi::distributed::auto_parallel::str_join; SpmdInfo SoftmaxInferSpmd(const DistMetaTensor& x, int axis) { // Step0: Verify input args based on softmax logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -98,8 +98,8 @@ SpmdInfo SoftmaxInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, int axis) { // Step0: verify input args based on softmax logic - auto x_shape = phi::vectorize(x.dims()); - auto out_shape = phi::vectorize(out.dims()); + 
auto x_shape = common::vectorize(x.dims()); + auto out_shape = common::vectorize(out.dims()); int x_ndim = x_shape.size(); int out_ndim = out_shape.size(); auto out_dist_attr_src = out.dist_attr(); diff --git a/paddle/phi/infermeta/spmd_rules/split.cc b/paddle/phi/infermeta/spmd_rules/split.cc index 19c1ff96558710..895075a52f4578 100644 --- a/paddle/phi/infermeta/spmd_rules/split.cc +++ b/paddle/phi/infermeta/spmd_rules/split.cc @@ -28,7 +28,7 @@ using phi::distributed::auto_parallel::str_join; SpmdInfo SplitWithNumInferSpmd(const DistMetaTensor& x, int num, int axis) { // Step0: Verify input args based on split logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -105,8 +105,8 @@ SpmdInfo SplitWithNumInferSpmdReverse( int axis) { // Step0: Verify input args based on split logic int nouts = outs.size(); - int out_ndim = phi::vectorize(outs[0]->dims()).size(); - auto x_shape = phi::vectorize(x.dims()); + int out_ndim = common::vectorize(outs[0]->dims()).size(); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr.dims_mapping(); @@ -125,7 +125,7 @@ SpmdInfo SplitWithNumInferSpmdReverse( x_ndim, out_ndim)); for (int i = 0; i < num; i++) { - auto shape = phi::vectorize(outs[i]->dims()); + auto shape = common::vectorize(outs[i]->dims()); int ndim = shape.size(); auto dist_attr = outs[i]->dist_attr(); int dims_mapping_size = dist_attr.dims_mapping().size(); @@ -187,7 +187,7 @@ SpmdInfo SplitWithNumInferSpmdReverse( VLOG(4) << "Einsum Notation: " << x_axes << "-->" << out_axes; for (int i = 0; i < nouts; i++) { VLOG(4) << "Output" << std::to_string(i) << " shape: [" - << str_join(phi::vectorize(outs[i]->dims())) << "] " + << str_join(common::vectorize(outs[i]->dims())) << "] " << "src_dims_mapping: [" << str_join(outs[i]->dist_attr().dims_mapping()) << "] " << "dst_dims_mapping: [" diff --git a/paddle/phi/infermeta/spmd_rules/squeeze.cc b/paddle/phi/infermeta/spmd_rules/squeeze.cc index 8080e6c3d24ac3..6f711e04d6e219 100644 --- a/paddle/phi/infermeta/spmd_rules/squeeze.cc +++ b/paddle/phi/infermeta/spmd_rules/squeeze.cc @@ -105,7 +105,7 @@ void MakeSqueezeDimTransReverseWithAxis( SpmdInfo SqueezeInferSpmd(const DistMetaTensor& x, const std::vector& axis) { // Step0: Verify input args based on squeeze logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -177,9 +177,9 @@ SpmdInfo SqueezeInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, const std::vector& axis) { // Step0: Verify input args based on squeeze logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); - auto out_shape = phi::vectorize(out.dims()); + auto out_shape = common::vectorize(out.dims()); int out_ndim = out_shape.size(); auto out_dist_attr_src = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); diff --git a/paddle/phi/infermeta/spmd_rules/stack.cc b/paddle/phi/infermeta/spmd_rules/stack.cc index d6f667a7fbdfa9..5f3499a5b1ad7a 100644 --- a/paddle/phi/infermeta/spmd_rules/stack.cc +++ b/paddle/phi/infermeta/spmd_rules/stack.cc @@ -42,7 +42,7 @@ SpmdInfo StackInferSpmd(const 
std::vector& x, int axis) { x.end(), std::back_inserter(tensor_shapes), [](const DistMetaTensor& meta) { - return phi::vectorize(meta.dims()); + return common::vectorize(meta.dims()); }); bool all_empty = std::all_of(tensor_shapes.begin(), tensor_shapes.end(), IsEmpty); diff --git a/paddle/phi/infermeta/spmd_rules/transpose.cc b/paddle/phi/infermeta/spmd_rules/transpose.cc index 441ede3850d363..e4942f2e4718ef 100644 --- a/paddle/phi/infermeta/spmd_rules/transpose.cc +++ b/paddle/phi/infermeta/spmd_rules/transpose.cc @@ -50,7 +50,7 @@ void BuildEinsumNotation(const size_t x_ndim, SpmdInfo TransposeInferSpmd(const DistMetaTensor& x, const std::vector& perm) { // Step0: Verify input args based on transpose logic - std::vector x_shape = phi::vectorize(x.dims()); + std::vector x_shape = common::vectorize(x.dims()); size_t x_ndim = x_shape.size(); const TensorDistAttr& x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -107,8 +107,8 @@ SpmdInfo TransposeInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, const std::vector& perm) { // Step0: Verify input args based on transpose logic - const std::vector x_shape = phi::vectorize(x.dims()); - const std::vector out_shape = phi::vectorize(out.dims()); + const std::vector x_shape = common::vectorize(x.dims()); + const std::vector out_shape = common::vectorize(out.dims()); int x_ndim = x_shape.size(); int out_ndim = out_shape.size(); TensorDistAttr out_dist_attr_src = out.dist_attr(); @@ -171,7 +171,8 @@ SpmdInfo TransposeInferSpmdReverse(const DistMetaTensor& x, SpmdInfo TransposeGradInferSpmd(const DistMetaTensor& out_grad, const std::vector& perm) { - const std::vector out_grad_shape = phi::vectorize(out_grad.dims()); + const std::vector out_grad_shape = + common::vectorize(out_grad.dims()); size_t out_grad_ndim = out_grad_shape.size(); const std::vector out_grad_dims_mapping = out_grad.dist_attr().dims_mapping(); diff --git a/paddle/phi/infermeta/spmd_rules/triu.cc b/paddle/phi/infermeta/spmd_rules/triu.cc index 4414e3b14016f7..ed98889de4ea72 100644 --- a/paddle/phi/infermeta/spmd_rules/triu.cc +++ b/paddle/phi/infermeta/spmd_rules/triu.cc @@ -24,7 +24,7 @@ namespace distributed { using phi::distributed::auto_parallel::str_join; SpmdInfo TriuInferSpmdBase(const DistMetaTensor& x) { - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -71,7 +71,7 @@ SpmdInfo TriuInferSpmd(const DistMetaTensor& x, int diagonal) { SpmdInfo TriuInferSpmdReverseBase(const DistMetaTensor& x, const DistMetaTensor& out) { - auto out_shape = phi::vectorize(out.dims()); + auto out_shape = common::vectorize(out.dims()); int out_ndim = out_shape.size(); auto out_dist_attr_src = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); @@ -117,7 +117,7 @@ SpmdInfo TriuInferSpmdReverse(const DistMetaTensor& x, } SpmdInfo TriuGradInferSpmdBase(const DistMetaTensor& out_grad) { - auto out_shape = phi::vectorize(out_grad.dims()); + auto out_shape = common::vectorize(out_grad.dims()); int out_ndim = out_shape.size(); auto out_dist_attr_src = out_grad.dist_attr(); const std::vector& out_dims_mapping = diff --git a/paddle/phi/infermeta/spmd_rules/unsqueeze.cc b/paddle/phi/infermeta/spmd_rules/unsqueeze.cc index a5819f5adac39a..935140a0509bab 100644 --- a/paddle/phi/infermeta/spmd_rules/unsqueeze.cc +++ 
b/paddle/phi/infermeta/spmd_rules/unsqueeze.cc @@ -92,7 +92,7 @@ std::vector> MakeUnsqueezeDimTransReverse( SpmdInfo UnsqueezeInferSpmd(const DistMetaTensor& x, const std::vector& axis) { // Step0: Verify input args based on unsqueeze logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -161,9 +161,9 @@ SpmdInfo UnsqueezeInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& out, const std::vector& axis) { // Step0: Verify input args based on unsqueeze logic - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); - auto out_shape = phi::vectorize(out.dims()); + auto out_shape = common::vectorize(out.dims()); int out_ndim = out_shape.size(); auto out_dist_attr_src = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); diff --git a/paddle/phi/infermeta/spmd_rules/where.cc b/paddle/phi/infermeta/spmd_rules/where.cc index d5cd639f8d5798..b176365bb2d7d0 100644 --- a/paddle/phi/infermeta/spmd_rules/where.cc +++ b/paddle/phi/infermeta/spmd_rules/where.cc @@ -26,7 +26,7 @@ using phi::distributed::auto_parallel::str_join; SpmdInfo WhereInferSpmd(const DistMetaTensor& condition, const DistMetaTensor& x, const DistMetaTensor& y) { - auto cond_shape = phi::vectorize(condition.dims()); + auto cond_shape = common::vectorize(condition.dims()); int cond_ndim = cond_shape.size(); auto cond_dist_attr_src = condition.dist_attr(); std::vector cond_dims_mapping = cond_dist_attr_src.dims_mapping(); @@ -41,7 +41,7 @@ SpmdInfo WhereInferSpmd(const DistMetaTensor& condition, std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; std::string cond_axes = alphabet.substr(0, cond_ndim); - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -63,7 +63,7 @@ SpmdInfo WhereInferSpmd(const DistMetaTensor& condition, std::string x_axes = alphabet.substr(cond_ndim - x_ndim, x_ndim); - auto y_shape = phi::vectorize(y.dims()); + auto y_shape = common::vectorize(y.dims()); int y_ndim = y_shape.size(); auto y_dist_attr_src = y.dist_attr(); std::vector y_dims_mapping = y_dist_attr_src.dims_mapping(); @@ -134,7 +134,7 @@ SpmdInfo WhereInferSpmdReverse(const DistMetaTensor& condition, const DistMetaTensor& x, const DistMetaTensor& y, const DistMetaTensor& output) { - auto cond_shape = phi::vectorize(condition.dims()); + auto cond_shape = common::vectorize(condition.dims()); int cond_ndim = cond_shape.size(); auto cond_dist_attr_src = condition.dist_attr(); std::vector cond_dims_mapping = cond_dist_attr_src.dims_mapping(); @@ -149,7 +149,7 @@ SpmdInfo WhereInferSpmdReverse(const DistMetaTensor& condition, std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; std::string cond_axes = alphabet.substr(0, cond_ndim); - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -170,7 +170,7 @@ SpmdInfo WhereInferSpmdReverse(const DistMetaTensor& condition, std::string x_axes = alphabet.substr(cond_ndim - x_ndim, x_ndim); - auto y_shape = phi::vectorize(y.dims()); + auto y_shape = common::vectorize(y.dims()); int y_ndim = y_shape.size(); auto 
y_dist_attr_src = y.dist_attr(); std::vector y_dims_mapping = y_dist_attr_src.dims_mapping(); @@ -191,7 +191,7 @@ SpmdInfo WhereInferSpmdReverse(const DistMetaTensor& condition, std::string y_axes = alphabet.substr(cond_ndim - y_ndim, y_ndim); - auto out_shape = phi::vectorize(output.dims()); + auto out_shape = common::vectorize(output.dims()); int out_ndim = out_shape.size(); auto out_dist_attr_src = output.dist_attr(); std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); @@ -248,7 +248,7 @@ SpmdInfo WhereGradInferSpmd(const DistMetaTensor& condition, const DistMetaTensor& x, const DistMetaTensor& y, const DistMetaTensor& out_grad) { - auto cond_shape = phi::vectorize(condition.dims()); + auto cond_shape = common::vectorize(condition.dims()); int cond_ndim = cond_shape.size(); auto cond_dist_attr_src = condition.dist_attr(); std::vector cond_dims_mapping = cond_dist_attr_src.dims_mapping(); @@ -263,7 +263,7 @@ SpmdInfo WhereGradInferSpmd(const DistMetaTensor& condition, std::string alphabet = "abcdefghijklmnopqrstuvwxyz"; std::string cond_axes = alphabet.substr(0, cond_ndim); - auto x_shape = phi::vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int x_ndim = x_shape.size(); auto x_dist_attr_src = x.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); @@ -284,7 +284,7 @@ SpmdInfo WhereGradInferSpmd(const DistMetaTensor& condition, std::string x_axes = alphabet.substr(cond_ndim - x_ndim, x_ndim); - auto y_shape = phi::vectorize(y.dims()); + auto y_shape = common::vectorize(y.dims()); int y_ndim = y_shape.size(); auto y_dist_attr_src = y.dist_attr(); std::vector y_dims_mapping = y_dist_attr_src.dims_mapping(); @@ -305,7 +305,7 @@ SpmdInfo WhereGradInferSpmd(const DistMetaTensor& condition, std::string y_axes = alphabet.substr(cond_ndim - y_ndim, y_ndim); - auto out_grad_shape = phi::vectorize(out_grad.dims()); + auto out_grad_shape = common::vectorize(out_grad.dims()); int out_grad_ndim = out_grad_shape.size(); auto out_grad_dist_attr_src = out_grad.dist_attr(); std::vector out_grad_dims_mapping = diff --git a/paddle/phi/infermeta/strings/nullary.cc b/paddle/phi/infermeta/strings/nullary.cc index c2428a2ff3ae9e..80f75c0e067217 100644 --- a/paddle/phi/infermeta/strings/nullary.cc +++ b/paddle/phi/infermeta/strings/nullary.cc @@ -17,7 +17,7 @@ namespace phi { namespace strings { void CreateInferMeta(const IntArray& shape, MetaTensor* out) { - const auto& out_dims = phi::make_ddim(shape.GetData()); + const auto& out_dims = common::make_ddim(shape.GetData()); out->set_dims(out_dims); out->set_dtype(DataType::PSTRING); out->set_layout(DataLayout::PSTRING_UNION); diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index a38e9ca6f9a14f..27fe7dc19ae4cc 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "glog/logging.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/impl/box_coder.h" @@ -66,9 +66,9 @@ void AccuracyInferMeta(const MetaTensor& out, label_dim[0])); } - accuracy->set_dims(phi::make_ddim({})); - correct->set_dims(phi::make_ddim({})); - total->set_dims(phi::make_ddim({})); + accuracy->set_dims(common::make_ddim({})); + correct->set_dims(common::make_ddim({})); + total->set_dims(common::make_ddim({})); accuracy->set_dtype(out.dtype()); correct->set_dtype(out.dtype()); total->set_dtype(out.dtype()); @@ -141,7 +141,7 @@ void AddmmInferMeta(const MetaTensor& input, output_dims.push_back(x_dims[0]); output_dims.push_back(y_dims[1]); - out->set_dims(make_ddim(output_dims)); + out->set_dims(common::make_ddim(output_dims)); out->share_lod(input); out->set_dtype(input.dtype()); } @@ -264,12 +264,12 @@ void DpsgdInferMeta(const MetaTensor& param, int size, MetaTensor* param_out) { auto lr_dims = learning_rate.dims(); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), + PADDLE_ENFORCE_EQ(common::product(lr_dims), 1, phi::errors::InvalidArgument( "Learning rate should have 1 dimension. But Received " "LearningRate's dims [%s].", - phi::product(lr_dims))); + common::product(lr_dims))); auto param_dims = param.dims(); PADDLE_ENFORCE_EQ( param_dims, @@ -299,23 +299,23 @@ void ArangeTensorInferMeta(const MetaTensor& start, const MetaTensor& end, const MetaTensor& step, MetaTensor* out) { - PADDLE_ENFORCE_EQ(phi::product(start.dims()), + PADDLE_ENFORCE_EQ(common::product(start.dims()), 1, phi::errors::InvalidArgument( "The numel of Input(start) should be 1, but got %d", - phi::product(start.dims()))); + common::product(start.dims()))); - PADDLE_ENFORCE_EQ(phi::product(end.dims()), + PADDLE_ENFORCE_EQ(common::product(end.dims()), 1, phi::errors::InvalidArgument( "The numel of Input(end) should be 1, but got %d", - phi::product(end.dims()))); + common::product(end.dims()))); - PADDLE_ENFORCE_EQ(phi::product(step.dims()), + PADDLE_ENFORCE_EQ(common::product(step.dims()), 1, phi::errors::InvalidArgument( "The numel of Input(step) should be 1, but got %d", - phi::product(step.dims()))); + common::product(step.dims()))); out->set_dims({-1}); out->set_dtype(start.dtype()); @@ -334,7 +334,7 @@ void InstanceNormInferMeta(const MetaTensor& x, phi::errors::InvalidArgument( "The y in InstanceNormInferMeta can't be nullptr.")); const auto x_dims = x.dims(); - PADDLE_ENFORCE_NE(phi::product(x_dims), + PADDLE_ENFORCE_NE(common::product(x_dims), 0, phi::errors::PreconditionNotMet( "The Input variable X has not " @@ -373,7 +373,7 @@ void InstanceNormInferMeta(const MetaTensor& x, "of scale is [%d]", scale_dim, scale_dim.size())); - bool check = !((!config.is_runtime) && (phi::product(scale_dim) <= 0)); + bool check = !((!config.is_runtime) && (common::product(scale_dim) <= 0)); if (check) { PADDLE_ENFORCE_EQ(scale_dim[0], C, @@ -395,7 +395,7 @@ void InstanceNormInferMeta(const MetaTensor& x, "of bias is [%d]", bias_dim, bias_dim.size())); - bool check = !((!config.is_runtime) && (phi::product(bias_dim) <= 0)); + bool check = !((!config.is_runtime) && (common::product(bias_dim) <= 0)); if (check) { PADDLE_ENFORCE_EQ(bias_dim[0], C, @@ -458,7 +458,7 @@ void GroupNormInferMeta(const MetaTensor& x, x_dim.size(), x_dim)); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = 
common::StringToDataLayout(data_layout_str); const int64_t channel_num = (data_layout == DataLayout::kNCHW ? x_dim[1] : x_dim[x_dim.size() - 1]); auto batch_size = x_dim[0]; @@ -578,7 +578,7 @@ void LayerNormInferMeta(const MetaTensor& x, begin_norm_axis, x_dim.size())); - auto matrix_dim = phi::flatten_to_2d(x_dim, begin_norm_axis); + auto matrix_dim = common::flatten_to_2d(x_dim, begin_norm_axis); // keep the axis size before normalization for shape of variance and mean auto before_norm_dims = slice_ddim(x_dim, 0, begin_norm_axis); @@ -693,27 +693,27 @@ void LinspaceRawInferMeta(const MetaTensor& start, const MetaTensor& number, MetaTensor* out) { PADDLE_ENFORCE_EQ( - phi::product(start.dims()), + common::product(start.dims()), 1, phi::errors::InvalidArgument("The size of Input(start) should be 1," "but got %d.", - phi::product(start.dims()))); + common::product(start.dims()))); PADDLE_ENFORCE_EQ( - phi::product(stop.dims()), + common::product(stop.dims()), 1, phi::errors::InvalidArgument("The size of Input(stop) should be 1," "but got %d.", - phi::product(stop.dims()))); + common::product(stop.dims()))); PADDLE_ENFORCE_EQ( - phi::product(number.dims()), + common::product(number.dims()), 1, phi::errors::InvalidArgument("The size of Input(number) should be 1," "but got %d.", - phi::product(number.dims()))); + common::product(number.dims()))); - out->set_dims(phi::make_ddim({-1})); + out->set_dims(common::make_ddim({-1})); out->set_dtype(start.dtype()); } @@ -810,11 +810,11 @@ void MultiClassNMSInferMeta(const MetaTensor& bboxes, // Here the box_dims[0] is not the real dimension of output. // It will be rewritten in the computing kernel. - out->set_dims(phi::make_ddim({-1, box_dims[2] + 2})); + out->set_dims(common::make_ddim({-1, box_dims[2] + 2})); out->set_dtype(bboxes.dtype()); - index->set_dims(phi::make_ddim({-1, 1})); + index->set_dims(common::make_ddim({-1, 1})); index->set_dtype(DataType::INT32); - nms_rois_num->set_dims(phi::make_ddim({-1})); + nms_rois_num->set_dims(common::make_ddim({-1})); nms_rois_num->set_dtype(DataType::INT32); } @@ -832,8 +832,8 @@ void NllLossRawInferMeta(const MetaTensor& input, true, phi::errors::InvalidArgument( "The tensor rank of Input(X) must be 2 or 4.")); - bool contain_unknown_dim = - phi::contain_unknown_dim(x_dims) || phi::contain_unknown_dim(label_dims); + bool contain_unknown_dim = common::contain_unknown_dim(x_dims) || + common::contain_unknown_dim(label_dims); bool check = config.is_runtime || !contain_unknown_dim; if (check) { PADDLE_ENFORCE_EQ( @@ -867,7 +867,7 @@ void NllLossRawInferMeta(const MetaTensor& input, if (reduction == "none") { out->set_dims({x_dims[0]}); } else { - out->set_dims(phi::make_ddim({})); + out->set_dims(common::make_ddim({})); } } else if (x_dims.size() == 4) { PADDLE_ENFORCE_EQ(label_dims.size(), @@ -890,10 +890,10 @@ void NllLossRawInferMeta(const MetaTensor& input, if (reduction == "none") { out->set_dims({x_dims[0], x_dims[2], x_dims[3]}); } else { - out->set_dims(phi::make_ddim({})); + out->set_dims(common::make_ddim({})); } } - total_weight->set_dims(phi::make_ddim({})); + total_weight->set_dims(common::make_ddim({})); out->set_dtype(input.dtype()); total_weight->set_dtype(input.dtype()); } @@ -1250,9 +1250,9 @@ void SendURecvInferMeta(const MetaTensor& x, "Src_index and Dst_index should have the same shape.")); auto dims = x.dims(); - std::vector dims_ = phi::vectorize(dims); + std::vector dims_ = common::vectorize(dims); dims_[0] = -1; - out->set_dims(phi::make_ddim(dims_)); + 
out->set_dims(common::make_ddim(dims_)); out->set_dtype(x.dtype()); if (reduce_op == "MEAN") { @@ -1267,7 +1267,7 @@ void SparseMomentumInferMeta(const MetaTensor& param, MetaTensor* param_out, MetaTensor* velocity_out, MetaTensor* master_param_out) { - auto lr_dims = phi::product(learning_rate.dims()); + auto lr_dims = common::product(learning_rate.dims()); PADDLE_ENFORCE_EQ(lr_dims != 0 && lr_dims == 1, true, phi::errors::InvalidArgument( @@ -1500,7 +1500,7 @@ void QuantLinearInferMeta(const MetaTensor& x, std::vector output_dims; - auto in_mat_dims = phi::flatten_to_2d(in_dims, in_num_col_dims); + auto in_mat_dims = common::flatten_to_2d(in_dims, in_num_col_dims); auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0]; auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1]; PADDLE_ENFORCE_EQ( @@ -1514,14 +1514,14 @@ void QuantLinearInferMeta(const MetaTensor& x, in_mat_dims[1], in_mat_dims, w_dims0, - phi::make_ddim({w_dims0, w_dims1}))); + common::make_ddim({w_dims0, w_dims1}))); output_dims.reserve(static_cast(in_num_col_dims + 1)); for (int i = 0; i < in_num_col_dims; ++i) { output_dims.push_back(in_dims[i]); } output_dims.push_back(w_dims1); - y->set_dims(make_ddim(output_dims)); + y->set_dims(common::make_ddim(output_dims)); y->share_lod(x); y->set_dtype(x.dtype()); } diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 26568d561ad007..bd3aea376503c3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -37,12 +37,12 @@ namespace phi { namespace detail { // Used in MatrixRankInferMeta static DDim CheckAndGetOutputDim(const DDim& dim_x) { - auto x_vec = phi::vectorize(dim_x); + auto x_vec = common::vectorize(dim_x); if (x_vec.size() == 2) { - return phi::make_ddim({}); + return common::make_ddim({}); } x_vec.erase(x_vec.end() - 2, x_vec.end()); - return phi::make_ddim(x_vec); + return common::make_ddim(x_vec); } } // namespace detail @@ -115,10 +115,10 @@ void AffineGridInferMeta(const MetaTensor& input, theta_dims)); if (outputShape.GetData().size() == 4 && !is_from_tensor) { // N * H * W * 2 - output->set_dims(phi::make_ddim({theta_dims[0], -1, -1, 2})); + output->set_dims(common::make_ddim({theta_dims[0], -1, -1, 2})); } else { // N * D * H * W * 3 - output->set_dims(phi::make_ddim({theta_dims[0], -1, -1, -1, 3})); + output->set_dims(common::make_ddim({theta_dims[0], -1, -1, -1, 3})); } output->set_dtype(input.dtype()); output->share_lod(input); @@ -146,7 +146,7 @@ void AllToAllInferMeta(const MetaTensor& x, MetaTensor* out) { void ArrayLengthInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(phi::DataType::INT64); - out->set_dims(make_ddim({1})); + out->set_dims(common::make_ddim({1})); } void ArrayToTensorInferMeta(const MetaTensor& x, @@ -158,21 +158,21 @@ void ArrayToTensorInferMeta(const MetaTensor& x, if (config.is_runtime) return; auto dims = x.dims(); // if the shape is empty - if (dims == phi::make_ddim({0UL})) return; + if (dims == common::make_ddim({0UL})) return; // otherwise, suppose the shape of array is the shape of tensor in the // array, which is consistent with what tensor_array_read_write does if (use_stack) { - auto dim_vec = phi::vectorize(dims); + auto dim_vec = common::vectorize(dims); // use -1 for the stack dim size dim_vec.insert(dim_vec.begin() + axis, -1); - dims = phi::make_ddim(dim_vec); + dims = common::make_ddim(dim_vec); } else { // use -1 for the concat dim size dims[axis] = -1; } out->set_dims(dims); out_index->set_dtype(DataType::INT32); -
out_index->set_dims(phi::make_ddim({-1})); + out_index->set_dims(common::make_ddim({-1})); } void ArgMinMaxInferMeta(const MetaTensor& x, @@ -208,7 +208,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, vec = std::vector(x.dims().size() - 1, -1); } } - out->set_dims(phi::make_ddim(vec)); + out->set_dims(common::make_ddim(vec)); if (dtype == DataType::INT32 || dtype == DataType::INT64) { out->set_dtype(dtype); } @@ -249,7 +249,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, if (dtype == DataType::INT32) { int64_t all_element_num = 0; if (flatten) { - all_element_num = phi::product(x_dims); + all_element_num = common::product(x_dims); } else { all_element_num = x_dims[static_cast(int_axis)]; } @@ -282,7 +282,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, vec.emplace_back(x_dims[static_cast(i)]); } - out->set_dims(phi::make_ddim(vec)); + out->set_dims(common::make_ddim(vec)); if (dtype == DataType::INT32 || dtype == DataType::INT64) { out->set_dtype(dtype); } @@ -325,9 +325,9 @@ void ArgsortInferMeta(const MetaTensor& input, } void AsRealInferMeta(const MetaTensor& input, MetaTensor* output) { - auto out_dims_v = phi::vectorize(input.dims()); + auto out_dims_v = common::vectorize(input.dims()); out_dims_v.push_back(2); - auto out_dims = phi::make_ddim(out_dims_v); + auto out_dims = common::make_ddim(out_dims_v); output->set_dims(out_dims); output->share_lod(input); output->set_dtype(dtype::ToReal(input.dtype())); @@ -374,7 +374,7 @@ void BatchSizeLikeInferMeta(const MetaTensor& x, std::transform(shape.begin(), shape.end(), shape_int64.begin(), [](int a) { return static_cast(a); }); - auto output_dim = phi::make_ddim(shape_int64); + auto output_dim = common::make_ddim(shape_int64); int input_dim_size = static_cast(x.dims().size()); PADDLE_ENFORCE_GE( @@ -459,7 +459,7 @@ void CINNBroadcastInferMeta(const MetaTensor& x, const std::vector& axes, const std::vector& out_shape, MetaTensor* out) { - out->set_dims(phi::make_ddim(out_shape)); + out->set_dims(common::make_ddim(out_shape)); out->set_dtype(x.dtype()); } @@ -488,7 +488,7 @@ void ClassCenterSampleInferMeta(const MetaTensor& label, "output of sampled local class center should not be null.")); remapped_label->set_dims(label.dims()); remapped_label->set_dtype(label.dtype()); - sampled_local_class_center->set_dims(phi::make_ddim({num_samples})); + sampled_local_class_center->set_dims(common::make_ddim({num_samples})); sampled_local_class_center->set_dtype(label.dtype()); } @@ -532,7 +532,7 @@ void CumInferMeta(const MetaTensor& x, MetaTensor* out) { auto x_dims = x.dims(); if (flatten) { - out->set_dims(phi::make_ddim({phi::product(x_dims)})); + out->set_dims(common::make_ddim({common::product(x_dims)})); out->set_dtype(x.dtype()); } else { if (x_dims.size() > 0) { @@ -598,7 +598,7 @@ void CumWithIndicesInferMeta(const MetaTensor& x, _axis = axis; } PADDLE_ENFORCE_LT( - phi::vectorize(x_dims)[_axis], + common::vectorize(x_dims)[_axis], INT32_MAX, phi::errors::OutOfRange( "cummax with axis %ld may be overflow, set dtype int64 to continue", @@ -678,7 +678,7 @@ void CropInferMeta(const MetaTensor& x, } } } - out->set_dims(phi::make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype(x.dtype()); } @@ -698,7 +698,7 @@ void DecodeJpegInferMeta(const MetaTensor& x, mode); } if (out != nullptr) { - out->set_dims(phi::make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype(x.dtype()); } } @@ -767,11 +767,11 @@ void DiagEmbedInferMeta( dim2)); int new_dim_len = static_cast(offset_ + 
x_dims[x_dims.size() - 1]); - auto sizes = vectorize(x_dims); + auto sizes = common::vectorize(x_dims); sizes.pop_back(); sizes.insert(sizes.begin() + std::min(dim1_, dim2_), new_dim_len); sizes.insert(sizes.begin() + std::max(dim1_, dim2_), new_dim_len); - out->set_dims(phi::make_ddim(sizes)); + out->set_dims(common::make_ddim(sizes)); out->set_dtype(x.dtype()); } @@ -873,7 +873,7 @@ void DiagonalInferMeta(const MetaTensor& input, axis1, axis2)); - auto out_dims = vectorize(x_dims); + auto out_dims = common::vectorize(x_dims); // from out_dims get the dim size of axis1_. auto axis1_size = out_dims[axis1_]; auto axis2_size = out_dims[axis2_]; @@ -903,7 +903,7 @@ void DiagonalInferMeta(const MetaTensor& input, out_dims.push_back(0); } } - out->set_dims(phi::make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype(input.dtype()); } @@ -962,7 +962,7 @@ void EigInferMeta(const MetaTensor& x, MetaTensor* out_w, MetaTensor* out_v) { const DataType& x_dtype = x.dtype(); const DataType& out_dtype = IsComplexType(x_dtype) ? x_dtype : ToComplexType(x_dtype); - out_w->set_dims(phi::make_ddim(batch_dims_vec)); + out_w->set_dims(common::make_ddim(batch_dims_vec)); out_w->set_dtype(out_dtype); out_v->set_dims(x_dims); out_v->set_dtype(out_dtype); @@ -1009,7 +1009,7 @@ void EighInferMeta(const MetaTensor& x, for (auto i = 0; i < rank - 1; i++) { values_dim.emplace_back(input_dim[i]); } - out_w->set_dims(phi::make_ddim(values_dim)); + out_w->set_dims(common::make_ddim(values_dim)); out_w->set_dtype(dtype::ToReal(x.dtype())); out_v->set_dims(input_dim); out_v->set_dtype(x.dtype()); @@ -1026,7 +1026,7 @@ void EigvalsInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config) { x_dims.size(), x_dims)); - if (config.is_runtime || !phi::contain_unknown_dim(x_dims)) { + if (config.is_runtime || !common::contain_unknown_dim(x_dims)) { int last_dim = x_dims.size() - 1; PADDLE_ENFORCE_EQ(x_dims[last_dim], x_dims[last_dim - 1], @@ -1037,14 +1037,14 @@ void EigvalsInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config) { x_dims)); } - auto out_dims = vectorize(x_dims); + auto out_dims = common::vectorize(x_dims); out_dims.resize(x_dims.size() - 1); const DataType& x_dtype = x.dtype(); const DataType& out_dtype = IsComplexType(x_dtype) ? 
x_dtype : ToComplexType(x_dtype); - out->set_dims(make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype(out_dtype); } @@ -1079,7 +1079,7 @@ void EigvalshInferMeta(const MetaTensor& x, } if (out_w != nullptr) { - out_w->set_dims(phi::make_ddim(values_dim)); + out_w->set_dims(common::make_ddim(values_dim)); out_w->set_dtype(dtype::ToReal(x.dtype())); } if (out_v != nullptr) { @@ -1127,7 +1127,7 @@ void EinsumInferMeta(const std::vector& inputs, << paddle::string::join_strings(output_dims, ","); VLOG(3) << "Label Type is : " << label_to_string(all_labels, labeltype); VLOG(3) << "Label Shape is : " << label_to_string(all_labels, labelshape); - out->set_dims(make_ddim(output_dims)); + out->set_dims(common::make_ddim(output_dims)); out->set_dtype(inputs[0]->dtype()); } @@ -1207,7 +1207,7 @@ void ExpandInferMeta(const MetaTensor& x, } } - out->set_dims(make_ddim(out_shape)); + out->set_dims(common::make_ddim(out_shape)); out->set_dtype(x.dtype()); if (out_rank > 0 && out_shape[0] == x_dims[0]) { out->share_lod(x); @@ -1403,7 +1403,7 @@ void FlattenWithXShapeInferMeta(const MetaTensor& x, for (int i = stop_axis + 1; i < in_dims_size; i++) { out_shape.push_back(x_dims[i]); // NOLINT } - const auto& out_dims = phi::make_ddim(out_shape); + const auto& out_dims = common::make_ddim(out_shape); out->set_dims(out_dims); out->set_dtype(x.dtype()); out->set_layout(x.layout()); @@ -1419,7 +1419,7 @@ void FlattenWithXShapeInferMeta(const MetaTensor& x, for (int i = 0; i < x_dims.size(); ++i) { xshape_dims[i + 1] = x_dims[i]; } - xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->set_dims(common::make_ddim(xshape_dims)); xshape->share_lod(x); } @@ -1493,7 +1493,7 @@ void FlipInferMeta(const MetaTensor& x, output_dims[i] = x_dims[i]; } - out->set_dims(phi::make_ddim(output_dims)); + out->set_dims(common::make_ddim(output_dims)); out->set_dtype(x.dtype()); out->share_lod(x); } @@ -1699,7 +1699,7 @@ void FoldInferMeta(const MetaTensor& x, out_dims.push_back(output_height); out_dims.push_back(output_width); if (out != nullptr) { - out->set_dims(phi::make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype(x.dtype()); } } @@ -1751,7 +1751,7 @@ void FrameInferMeta(const MetaTensor& x, end_axis = x_rank - 2; } - bool contain_unknown_dim = phi::contain_unknown_dim(x_dims); + bool contain_unknown_dim = common::contain_unknown_dim(x_dims); bool check = config.is_runtime || !contain_unknown_dim; if (check) { PADDLE_ENFORCE_LE(frame_length, @@ -1784,7 +1784,7 @@ void FrameInferMeta(const MetaTensor& x, output_shape.push_back(n_frames); } - out->set_dims(phi::make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->set_dtype(x.dtype()); } @@ -1835,7 +1835,7 @@ void IdentityLossInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); out->set_dims(x.dims()); } else { - out->set_dims(phi::make_ddim({})); + out->set_dims(common::make_ddim({})); out->set_dtype(x.dtype()); } } @@ -1854,8 +1854,8 @@ void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { static phi::DDim ValidateShape(const std::vector shape, const phi::DDim& in_dims) { - const int64_t in_size = phi::product(in_dims); - auto in_dims_vec = phi::vectorize(in_dims); + const int64_t in_size = common::product(in_dims); + auto in_dims_vec = common::vectorize(in_dims); std::vector output_shape(shape.size(), 0); int64_t capacity = 1; int unk_dim_idx = -1; @@ -1869,7 +1869,7 @@ static phi::DDim ValidateShape(const std::vector shape, phi::errors::InvalidArgument( 
"Only one dimension value of 'shape' in ReshapeOp can " "be -1. But received shape = [%s], shape[%d] is also -1.", - phi::make_ddim(shape), + common::make_ddim(shape), i)); unk_dim_idx = static_cast(i); output_shape[i] = shape[i]; @@ -1893,7 +1893,7 @@ static phi::DDim ValidateShape(const std::vector shape, "Each dimension value of 'shape' in ReshapeOp must not " "be negative except one unknown dimension. " "But received shape = [%s], shape[%d] = %d.", - phi::make_ddim(shape), + common::make_ddim(shape), i, shape[i])); output_shape[i] = shape[i]; @@ -1912,7 +1912,7 @@ static phi::DDim ValidateShape(const std::vector shape, "can not rehsape %s to %s, because the unspecified " "dimension %i can be any number and is ambiguous", in_dims, - phi::make_ddim(shape), + common::make_ddim(shape), unk_dim_idx)); } @@ -1934,7 +1934,7 @@ static phi::DDim ValidateShape(const std::vector shape, "'shape' is [%s], known capacity of 'shape' is %d.", in_dims, in_size, - phi::make_ddim(shape), + common::make_ddim(shape), capacity)); } else { // such as [-1, 8, 3]->[-1, 8], out_shape will remain [-1, 8] @@ -1953,12 +1953,12 @@ static phi::DDim ValidateShape(const std::vector shape, "[%s], the capacity of 'shape' is %d.", in_dims, in_size, - phi::make_ddim(shape), + common::make_ddim(shape), capacity)); } } - return phi::make_ddim(output_shape); + return common::make_ddim(output_shape); } void InferMetaFromVecValue(const MetaTensor& x, @@ -2015,7 +2015,7 @@ void InverseInferMeta(const MetaTensor& x, MetaTensor* out) { } void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out) { - out->set_dims(phi::make_ddim({})); + out->set_dims(common::make_ddim({})); out->set_dtype(DataType::BOOL); } @@ -2086,7 +2086,7 @@ void KthvalueInferMeta(const MetaTensor& x, for (int i = axis + 1; i < dim_size; i++) { dimvec.emplace_back(input_dims[i]); } - DDim dims = phi::make_ddim(dimvec); + DDim dims = common::make_ddim(dimvec); out->set_dims(dims); out->share_lod(x); out->set_dtype(x.dtype()); @@ -2163,17 +2163,17 @@ void LUInferMeta(const MetaTensor& x, int m = static_cast(x_dims[x_rank - 1]); int n = static_cast(x_dims[x_rank - 2]); int min_mn = std::min(m, n); - auto dims_vec = phi::vectorize(x_dims); + auto dims_vec = common::vectorize(x_dims); PADDLE_ENFORCE_NOT_NULL( infos, phi::errors::InvalidArgument("Output(Infos) should not be nullptr.")); if (x_rank == 2) { auto Infos_dim = std::vector(1); - infos->set_dims(phi::make_ddim(Infos_dim)); + infos->set_dims(common::make_ddim(Infos_dim)); } else { auto Infos_dim = std::vector(dims_vec.begin(), dims_vec.begin() + x_rank - 2); - infos->set_dims(phi::make_ddim(Infos_dim)); + infos->set_dims(common::make_ddim(Infos_dim)); } infos->set_dtype(DataType::INT32); if (pivot) { @@ -2183,7 +2183,7 @@ void LUInferMeta(const MetaTensor& x, auto Pivots_dim = std::vector(dims_vec.begin(), dims_vec.begin() + x_rank - 1); Pivots_dim[x_rank - 2] = min_mn; - pivots->set_dims(phi::make_ddim(Pivots_dim)); + pivots->set_dims(common::make_ddim(Pivots_dim)); pivots->set_dtype(DataType::INT32); } } @@ -2253,7 +2253,7 @@ void MaxOutInferMeta(const MetaTensor& x, std::vector output_shape( {in_x_dims[0], in_x_dims[1], in_x_dims[2], in_x_dims[3]}); output_shape[axis] = in_x_dims[axis] / groups; - out->set_dims(phi::make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->set_dtype(x.dtype()); } @@ -2326,15 +2326,15 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, } } - out->set_dims(make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); 
out->set_dtype(x.dtype()); - mask->set_dims(make_ddim(output_shape)); + mask->set_dims(common::make_ddim(output_shape)); mask->set_dtype(phi::CppTypeToDataType::Type()); } void MeanAllInferMeta(const MetaTensor& x, MetaTensor* out) { - out->set_dims(phi::make_ddim({})); + out->set_dims(common::make_ddim({})); out->set_dtype(x.dtype()); out->set_layout(x.layout()); } @@ -2389,7 +2389,7 @@ void ModeInferMeta(const MetaTensor& x, for (int i = axis + 1; i < dim_size; i++) { dimvec.emplace_back(input_dims[i]); } - DDim dims = phi::make_ddim(dimvec); + DDim dims = common::make_ddim(dimvec); out->set_dims(dims); out->share_lod(x); out->set_dtype(x.dtype()); @@ -2436,7 +2436,7 @@ void MultinomialInferMeta(const MetaTensor& x, out_dims[x_rank - 1] = -1; } - out->set_dims(make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype(DataType::INT64); } @@ -2450,7 +2450,7 @@ void NanmedianInferMeta(const MetaTensor& x, int64_t x_rank = x_dim.size(); out->set_dtype(x.dtype()); median_index->set_dtype(DataType::INT64); - median_index->set_dims(make_ddim({x.numel() * 2})); + median_index->set_dims(common::make_ddim({x.numel() * 2})); std::vector out_dim; if (axis_list.empty()) { @@ -2506,7 +2506,7 @@ void NanmedianInferMeta(const MetaTensor& x, } } - out->set_dims(make_ddim(out_dim)); + out->set_dims(common::make_ddim(out_dim)); } void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out) { @@ -2518,7 +2518,7 @@ void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out) { "whose shape must be [N, 4] " "N is the number of boxes " "in last dimension in format [x1, x2, y1, y2]. ")); - out->set_dims(phi::make_ddim({-1})); + out->set_dims(common::make_ddim({-1})); out->set_dtype(DataType::INT64); } @@ -2529,7 +2529,7 @@ void NonZeroInferMeta(const MetaTensor& condition, MetaTensor* out) { 1UL, phi::errors::InvalidArgument( "Input(Condition) should have number of dimension at least 1")); - out->set_dims(phi::make_ddim({-1, rank})); + out->set_dims(common::make_ddim({-1, rank})); out->set_dtype(DataType::INT64); } @@ -2561,9 +2561,9 @@ void OneHotRawInferMeta(const MetaTensor& x, x_dims.size(), 0, phi::errors::InvalidArgument("Rank of Input(X) should be at least 0.")); - auto out_dims_vec = phi::vectorize(x_dims); + auto out_dims_vec = common::vectorize(x_dims); out_dims_vec.push_back(depth.to()); - auto out_dims = phi::make_ddim(out_dims_vec); + auto out_dims = common::make_ddim(out_dims_vec); out->set_dims(out_dims); out->share_lod(x); out->set_dtype(dtype); @@ -2579,9 +2579,9 @@ void OneHotInferMeta(const MetaTensor& x, phi::errors::InvalidArgument("Rank of Input(X) should be at least 0.")); int depth = depth_t.to(); - auto out_dims_vec = phi::vectorize(x_dims); + auto out_dims_vec = common::vectorize(x_dims); out_dims_vec.push_back(depth); - auto out_dims = phi::make_ddim(out_dims_vec); + auto out_dims = common::make_ddim(out_dims_vec); out->set_dims(out_dims); out->share_lod(x); @@ -2637,7 +2637,7 @@ void OverlapAddInferMeta(const MetaTensor& x, end_axis = x_rank - 3; } - bool contain_unknown_dim = phi::contain_unknown_dim(x_dims); + bool contain_unknown_dim = common::contain_unknown_dim(x_dims); bool check = config.is_runtime || !contain_unknown_dim; if (check) { PADDLE_ENFORCE_LE( @@ -2669,7 +2669,7 @@ void OverlapAddInferMeta(const MetaTensor& x, output_shape.push_back(seq_length); } - out->set_dims(phi::make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); } void PadInferMeta(const MetaTensor& input, @@ -2704,7 +2704,7 @@ void 
PadInferMeta(const MetaTensor& input, out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; } } - out->set_dims(phi::make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); if (out_dims[0] == x_dim[0]) { // Only pass LoD when the first dimension is equal between // output and input. @@ -2786,7 +2786,7 @@ void Pad3dInferMeta(const MetaTensor& x, } } - out->set_dims(phi::make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype(x.dtype()); out->share_lod(x); } @@ -2986,7 +2986,7 @@ void PNormInferMeta(const MetaTensor& x, } } - out->set_dims(phi::make_ddim(out_dim_vector)); + out->set_dims(common::make_ddim(out_dim_vector)); out->set_dtype(x.dtype()); } @@ -3007,7 +3007,7 @@ void Pool2DInferMeta(const MetaTensor& x, (data_format == "NHWC" || data_format == "NDHWC"); if (!config.is_runtime && kernel_size.FromTensor()) { auto x_dims = x.dims(); - std::vector output_shape = std::move(phi::vectorize(x_dims)); + std::vector output_shape = std::move(common::vectorize(x_dims)); // set dims of HW -1 output_shape[x_dims.size() - 2] = -1; if (channel_last) { // for NHWC, NDHWC @@ -3015,7 +3015,7 @@ void Pool2DInferMeta(const MetaTensor& x, } else { // for NCHW output_shape[x_dims.size() - 1] = -1; } - out->set_dims(make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->share_lod(x); out->set_dtype(x.dtype()); } else { @@ -3107,7 +3107,7 @@ void PoolInferMeta(const MetaTensor& x, x_dims.size(), x_dims, kernel_size_.size(), - make_ddim(kernel_size_))); + common::make_ddim(kernel_size_))); PADDLE_ENFORCE_EQ( kernel_size_.size(), @@ -3119,8 +3119,8 @@ void PoolInferMeta(const MetaTensor& x, "size is %d, Attr(kernel_size_) is [%s], Attr(strides)is [%s].", kernel_size_.size(), strides.size(), - make_ddim(kernel_size_), - make_ddim(strides))); + common::make_ddim(kernel_size_), + common::make_ddim(strides))); // MKL-DNN Kernels are using NCHW order of dims description // so we ignore data_format consideration for MKL-DNN kernel @@ -3175,7 +3175,7 @@ void PoolInferMeta(const MetaTensor& x, output_shape.insert(output_shape.begin() + 1, x_dims[1]); } - out->set_dims(make_ddim(output_shape)); + out->set_dims(common::make_ddim(output_shape)); out->share_lod(x); out->set_dtype(x.dtype()); } @@ -3205,18 +3205,18 @@ void QrInferMeta(const MetaTensor& x, if (compute_q) { int k = reduced_mode ? min_mn : m; - auto q_dims_vec = phi::vectorize(x_dims); + auto q_dims_vec = common::vectorize(x_dims); q_dims_vec[q_dims_vec.size() - 1] = k; - q->set_dims(phi::make_ddim(q_dims_vec)); + q->set_dims(common::make_ddim(q_dims_vec)); } else { - q->set_dims(phi::make_ddim({0})); + q->set_dims(common::make_ddim({0})); } int k = reduced_mode ? 
min_mn : m; - auto r_dims_vec = phi::vectorize(x_dims); + auto r_dims_vec = common::vectorize(x_dims); r_dims_vec[r_dims_vec.size() - 2] = k; r_dims_vec[r_dims_vec.size() - 1] = n; - r->set_dims(phi::make_ddim(r_dims_vec)); + r->set_dims(common::make_ddim(r_dims_vec)); q->share_lod(x); r->share_lod(x); @@ -3288,7 +3288,7 @@ DDim ReduceInferDim(const MetaTensor& x, } } - DDim out_dim = phi::make_ddim(out_dim_vector); + DDim out_dim = common::make_ddim(out_dim_vector); return out_dim; } @@ -3339,7 +3339,7 @@ DDim ReduceInferDimForIntArrayAxis(const MetaTensor& x, } } } - return phi::make_ddim(vec_dim); + return common::make_ddim(vec_dim); } void ReduceIntArrayAxisInferMetaBase(const MetaTensor& x, @@ -3390,7 +3390,7 @@ void RepeatInterleaveInferMeta(const MetaTensor& x, int dim, MetaTensor* out) { const auto& input_dim = x.dims(); - auto output_dim = phi::vectorize(input_dim); + auto output_dim = common::vectorize(input_dim); auto n_dim = dim; if (n_dim < 0) n_dim += input_dim.size(); @@ -3425,7 +3425,7 @@ void RepeatInterleaveInferMeta(const MetaTensor& x, "repeat_interleave's output tensor can't be nullptr")); output_dim[n_dim] = input_dim[n_dim] * repeats; - out->set_dims(phi::make_ddim(output_dim)); + out->set_dims(common::make_ddim(output_dim)); out->share_lod(x); out->set_dtype(x.dtype()); } @@ -3439,7 +3439,7 @@ void ReshapeInferMeta(const MetaTensor& x, phi::errors::InvalidArgument( "Output(Out) of ReshapeOp should not be null.")); if (!config.is_runtime && shape.FromTensor()) { - out->set_dims(phi::make_ddim(shape_data)); + out->set_dims(common::make_ddim(shape_data)); out->share_lod(x); return; } @@ -3461,7 +3461,7 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, for (int i = 0; i < x_dims.size(); ++i) { xshape_dims[i + 1] = x_dims[i]; } - xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->set_dims(common::make_ddim(xshape_dims)); xshape->share_lod(x); xshape->set_strides(x.strides()); ReshapeInferMeta(x, shape, out, config); @@ -3620,7 +3620,7 @@ void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) { void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) { auto in_dim = input.dims(); - out->set_dims(phi::make_ddim({in_dim.size()})); + out->set_dims(common::make_ddim({in_dim.size()})); out->set_dtype(DataType::INT32); } @@ -3654,7 +3654,7 @@ void ShardIndexInferMeta(const MetaTensor& in, void NumelInferMeta(const MetaTensor& input, MetaTensor* out) { out->set_dtype(DataType::INT64); - out->set_dims(phi::make_ddim({})); + out->set_dims(common::make_ddim({})); } // This logic is copied from @@ -3862,7 +3862,7 @@ void SplitInferMeta(const MetaTensor& x, if ((sections.FromTensor() && !config.is_runtime) || axis_value == -1) { out_dims = std::vector( sections_data.size(), - phi::make_ddim(std::vector(x.dims().size(), -1))); + common::make_ddim(std::vector(x.dims().size(), -1))); } else { out_dims = std::vector(sections_data.size(), x.dims()); } @@ -3904,7 +3904,7 @@ void SplitInferMeta(const MetaTensor& x, "Only one dimension value of Attr(num_or_sections) " "in SplitOp can be -1. " "But received Attr(num_or_sections) = [%s].", - phi::make_ddim(sections_data))); + common::make_ddim(sections_data))); if (unknow_dim_idx != -1) { // for example, input shape = [4 ,5], axis = 1, sections = [2, 3, -1]. @@ -3919,7 +3919,7 @@ void SplitInferMeta(const MetaTensor& x, "size " "along the split dimension. 
But received Attr(num_or_sections) " "= [%s], input(X)'s shape = [%s], Attr(dim) = %d.", - phi::make_ddim(sections_data), + common::make_ddim(sections_data), x.dims(), axis_value)); @@ -3933,7 +3933,7 @@ void SplitInferMeta(const MetaTensor& x, "size " "along the split dimension. But received Attr(num_or_sections)" " = [%s], input(X)'s shape = [%s], Attr(dim) = %d.", - phi::make_ddim(sections_data), + common::make_ddim(sections_data), x.dims(), axis_value)); } @@ -3953,7 +3953,7 @@ void SplitWithNumInferMeta(const MetaTensor& x, std::vector out_dims; if (axis_value == -1) { out_dims = std::vector( - num, phi::make_ddim(std::vector(x.dims().size(), -1))); + num, common::make_ddim(std::vector(x.dims().size(), -1))); } else { out_dims = std::vector(num, x.dims()); } @@ -4024,7 +4024,7 @@ void SqueezeInferMeta(const MetaTensor& x, } std::vector vec_out_dims(output_size, -1); - out->set_dims(phi::make_ddim(vec_out_dims)); + out->set_dims(common::make_ddim(vec_out_dims)); } else { std::vector tmp; tmp.reserve(axes.GetData().size()); @@ -4055,7 +4055,7 @@ void SqueezeWithXShapeInferMeta(const MetaTensor& x, xshape_dims[i + 1] = x_dims[i]; } if (xshape) { - xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->set_dims(common::make_ddim(xshape_dims)); xshape->share_lod(x); xshape->set_dtype(x.dtype()); } @@ -4159,7 +4159,7 @@ void StridedSliceRawInferMeta(const MetaTensor& x, axes.size(), true); } - DDim out_dims(phi::make_ddim(out_dims_vector)); + DDim out_dims(common::make_ddim(out_dims_vector)); // generate new shape if (!decrease_axis.empty()) { std::vector new_out_shape; @@ -4180,7 +4180,7 @@ void StridedSliceRawInferMeta(const MetaTensor& x, new_out_shape.push_back(out_dims[i]); } } - out_dims = phi::make_ddim(new_out_shape); + out_dims = common::make_ddim(new_out_shape); } VLOG(4) << "out_dims: " << out_dims; out->set_dims(out_dims); @@ -4255,24 +4255,24 @@ void SvdInferMeta(const MetaTensor& x, MetaTensor* vh) { auto UDDim = [](const DDim& x_dim, int k) { // get x_dim and return the ddim of U - auto x_vec = vectorize(x_dim); + auto x_vec = common::vectorize(x_dim); x_vec[x_vec.size() - 1] = k; - return phi::make_ddim(x_vec); + return common::make_ddim(x_vec); }; auto VHDDim = [](const DDim& x_dim, int k) { // get x_dim and return the ddim of VH - auto x_vec = vectorize(x_dim); + auto x_vec = common::vectorize(x_dim); x_vec[x_vec.size() - 2] = k; - return phi::make_ddim(x_vec); + return common::make_ddim(x_vec); }; auto SDDim = [](const DDim& x_dim, int k) { // get x_dim and return the ddim of S - auto x_vec = vectorize(x_dim); + auto x_vec = common::vectorize(x_dim); x_vec[x_vec.size() - 2] = k; x_vec.erase(x_vec.end() - 1); // rank - 1 - return phi::make_ddim(x_vec); + return common::make_ddim(x_vec); }; auto in_dims = x.dims(); @@ -4380,7 +4380,7 @@ void TileInferMeta(const MetaTensor& x, auto out_rank = std::max(static_cast(x_dims.size()), repeat_times_data.size()); std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); + auto x_dim_vec = common::vectorize(x_dims); if (x_dim_vec.size() > repeat_times_data.size()) { auto diff = x_dim_vec.size() - repeat_times_data.size(); repeat_times_data.insert(repeat_times_data.begin(), diff, 1); @@ -4403,7 +4403,7 @@ void TileInferMeta(const MetaTensor& x, } } - out->set_dims(phi::make_ddim(out_shape)); + out->set_dims(common::make_ddim(out_shape)); if (out_rank > 0 && (out_shape[0] == x_dims[0])) { out->share_lod(x); } @@ -4529,14 +4529,14 @@ void TraceInferMeta( dim1, dim2)); - auto sizes = vectorize(x_dims); + auto sizes =
common::vectorize(x_dims); if (x_dims.size() == 2) { sizes.clear(); } else { sizes.erase(sizes.begin() + std::max(dim1_, dim2_)); sizes.erase(sizes.begin() + std::min(dim1_, dim2_)); } - out->set_dims(phi::make_ddim(sizes)); + out->set_dims(common::make_ddim(sizes)); out->set_dtype(x.dtype()); } @@ -4634,7 +4634,7 @@ void UnbindInferMeta(const MetaTensor& x, for (int i = 0; i < in_dims.size(); ++i) { if (i != axis) out_dim.push_back(in_dims[i]); // NOLINT } - auto out_dims = phi::make_ddim(out_dim); + auto out_dims = common::make_ddim(out_dim); for (auto& out : outs) { out->set_dtype(x.dtype()); @@ -4872,7 +4872,7 @@ void UnfoldInferMeta(const MetaTensor& x, output_height == -1 || output_width == -1 ? -1 : output_col_length; } out_dims.push_back(output_col_length); - out->set_dims(phi::make_ddim(out_dims)); + out->set_dims(common::make_ddim(out_dims)); out->set_dtype(x.dtype()); } @@ -4954,7 +4954,7 @@ void UniqueConsecutiveInferMeta(const MetaTensor& x, out->set_dims({-1}); out->set_dtype(x.dtype()); if (return_inverse) { - index->set_dims({phi::product(in_dims)}); + index->set_dims({common::product(in_dims)}); } } else { int axis_value = axis[0]; @@ -5023,7 +5023,7 @@ void UniqueRawInferMeta(const MetaTensor& x, "The Input(X) should be 0-D or 1-D Tensor, " "But now the dims of Input(X) is %d.", x.dims().size())); - out->set_dims(phi::make_ddim({-1})); + out->set_dims(common::make_ddim({-1})); index->set_dims(x.dims()); return; } @@ -5038,9 +5038,9 @@ void UniqueRawInferMeta(const MetaTensor& x, } if (axis.empty()) { - out->set_dims(phi::make_ddim({-1})); + out->set_dims(common::make_ddim({-1})); if (return_inverse) { - index->set_dims(phi::make_ddim({phi::product(x.dims())})); + index->set_dims(common::make_ddim({common::product(x.dims())})); } } else { int axis_value = axis[0]; @@ -5067,16 +5067,16 @@ void UniqueRawInferMeta(const MetaTensor& x, out_dims[axis_value] = -1; out->set_dims(out_dims); if (return_inverse) { - index->set_dims(phi::make_ddim({x.dims()[axis_value]})); + index->set_dims(common::make_ddim({x.dims()[axis_value]})); index->set_dtype(dtype); } } if (return_index) { - indices->set_dims(phi::make_ddim({-1})); + indices->set_dims(common::make_ddim({-1})); indices->set_dtype(dtype); } if (return_counts) { - counts->set_dims(phi::make_ddim({-1})); + counts->set_dims(common::make_ddim({-1})); counts->set_dtype(dtype); } } @@ -5098,7 +5098,7 @@ void UnsqueezeInferMeta(const MetaTensor& x, int output_size = static_cast(x.dims().size() + axes.GetData().size()); std::vector vec_out_dims(output_size, -1); out->set_dtype(x.dtype()); - out->set_dims(phi::make_ddim(vec_out_dims)); + out->set_dims(common::make_ddim(vec_out_dims)); } else { auto out_dims = funcs::GetUnsqueezeShape(axes.GetData(), x_dims); out->set_dims(out_dims); @@ -5123,7 +5123,7 @@ void UnsqueezeWithXShapeInferMeta(const MetaTensor& x, xshape_dims[i + 1] = x_dims[i]; } if (xshape) { - xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->set_dims(common::make_ddim(xshape_dims)); xshape->share_lod(x); xshape->set_dtype(x.dtype()); } @@ -5168,10 +5168,10 @@ void UnStackInferMeta(const MetaTensor& x, x_dim[axis], num)); } - auto vec = phi::vectorize(x_dim); + auto vec = common::vectorize(x_dim); vec.erase(vec.begin() + axis); for (size_t i = 0; i < output_count; i++) { - outs[i]->set_dims(phi::make_ddim(vec)); + outs[i]->set_dims(common::make_ddim(vec)); outs[i]->set_dtype(x.dtype()); } } @@ -5217,11 +5217,11 @@ void WeightQuantizeInferMeta(const MetaTensor& x, "'llm.int8'], but got[%s]", algo); } - 
out->set_dims(phi::make_ddim(dim_out)); + out->set_dims(common::make_ddim(dim_out)); out->set_dtype(DataType::INT8); - scale->set_dims(phi::make_ddim(dim_scale)); + scale->set_dims(common::make_ddim(dim_scale)); scale->set_dtype(DataType::FLOAT32); } @@ -5280,9 +5280,9 @@ void CheckNumericsInferMeta(const MetaTensor& tensor, MetaTensor* stats, MetaTensor* values) { stats->set_dtype(DataType::INT64); - stats->set_dims(phi::make_ddim({3})); + stats->set_dims(common::make_ddim({3})); values->set_dtype(DataType::FLOAT32); - values->set_dims(phi::make_ddim({3})); + values->set_dims(common::make_ddim({3})); } void StridedUnChangedInferMeta(const MetaTensor& x, MetaTensor* out) { diff --git a/paddle/phi/kernels/array_kernel.cc b/paddle/phi/kernels/array_kernel.cc index 4217b41e2aed9a..8a599dcf9d80d8 100644 --- a/paddle/phi/kernels/array_kernel.cc +++ b/paddle/phi/kernels/array_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/array_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/concat_kernel.h" #include "paddle/phi/kernels/full_kernel.h" @@ -92,9 +92,9 @@ void ArrayToTensorKernel(const Context& dev_ctx, } } } - auto vec = phi::vectorize(out_dims); + auto vec = common::vectorize(out_dims); vec.insert(vec.begin() + axis, x.size()); // NOLINT - out->Resize(phi::make_ddim(vec)); + out->Resize(common::make_ddim(vec)); std::vector tmp_inputs(x.size()); std::vector inputs; @@ -115,7 +115,7 @@ void ArrayToTensorKernel(const Context& dev_ctx, ConcatKernel(dev_ctx, inputs, axis, out); } - out_index->Resize(phi::make_ddim({static_cast(x.size())})); + out_index->Resize(common::make_ddim({static_cast(x.size())})); StackKernel(dev_ctx, indexs, 0, out_index); } diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index 7c8ed23131a88b..b4504f83818d77 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -106,7 +106,7 @@ void AssignValueKernel(const Context& dev_ctx, dtype, template_dtype)); CopyVectorToTensor(dev_ctx, values, out); - out->Resize(phi::make_ddim(shape)); + out->Resize(common::make_ddim(shape)); } } // namespace phi diff --git a/paddle/phi/kernels/autotune/cache_base.h b/paddle/phi/kernels/autotune/cache_base.h index 68463e900c3578..82af1ccbb71325 100644 --- a/paddle/phi/kernels/autotune/cache_base.h +++ b/paddle/phi/kernels/autotune/cache_base.h @@ -18,8 +18,8 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_int32(search_cache_max_number); diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index c50a571a7fd95d..1882c21b9cd72b 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/common/errors.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -21,7 +22,6 @@ #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 707218e9940981..a60369af449f4e 100644 --- 
a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -194,7 +194,7 @@ void CoalesceTensorKernel(const Context &dev_ctx, // Alloc the continuous space void *fused_tensor_ptr = dev_ctx.Alloc( - &fused_output->Resize(phi::make_ddim({static_cast(numel)})), + &fused_output->Resize(common::make_ddim({static_cast(numel)})), dtype); VLOG(10) << "Fused tensor addr " << fused_tensor_ptr; diff --git a/paddle/phi/kernels/cpu/affine_grid_grad_kernel.cc b/paddle/phi/kernels/cpu/affine_grid_grad_kernel.cc index 86568a0a018468..abd7188acefe50 100644 --- a/paddle/phi/kernels/cpu/affine_grid_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/affine_grid_grad_kernel.cc @@ -29,7 +29,7 @@ struct Linspace { bool align_corners, DenseTensor* numbers, const phi::CPUContext& dev_ctx) { - numbers->Resize(phi::make_ddim({count})); + numbers->Resize(common::make_ddim({count})); T* number_data = dev_ctx.template Alloc(numbers); T slice = (end - start) / (T)(count - 1); if (!align_corners) { @@ -55,7 +55,7 @@ void AffineGridGrad4DKernel(const Context& dev_ctx, int w = 0; h = static_cast(size_attr[2]); w = static_cast(size_attr[3]); - theta_grad->Resize(phi::make_ddim({n, 2, 3})); + theta_grad->Resize(common::make_ddim({n, 2, 3})); dev_ctx.template Alloc(theta_grad); phi::funcs::SetConstant()(dev_ctx, theta_grad, static_cast(0)); DenseTensor grid; @@ -94,7 +94,7 @@ void AffineGridGrad5DKernel(const Context& dev_ctx, d = static_cast(size_attr[2]); h = static_cast(size_attr[3]); w = static_cast(size_attr[4]); - theta_grad->Resize(phi::make_ddim({n, 3, 4})); + theta_grad->Resize(common::make_ddim({n, 3, 4})); dev_ctx.template Alloc(theta_grad); phi::funcs::SetConstant()(dev_ctx, theta_grad, static_cast(0)); DenseTensor grid; diff --git a/paddle/phi/kernels/cpu/affine_grid_kernel.cc b/paddle/phi/kernels/cpu/affine_grid_kernel.cc index 3ad0812f441f38..fef81c008e23a9 100644 --- a/paddle/phi/kernels/cpu/affine_grid_kernel.cc +++ b/paddle/phi/kernels/cpu/affine_grid_kernel.cc @@ -29,7 +29,7 @@ struct Linspace { bool align_corners, DenseTensor* numbers, const phi::CPUContext& dev_ctx) { - numbers->Resize(phi::make_ddim({count})); + numbers->Resize(common::make_ddim({count})); T* number_data = dev_ctx.template Alloc(numbers); T slice = (end - start) / (T)(count - 1); if (!align_corners) { @@ -55,7 +55,7 @@ void AffineGrid4DKernel(const Context& dev_ctx, int w = 0; h = static_cast(size_attr[2]); w = static_cast(size_attr[3]); - output->Resize(phi::make_ddim({n, h, w, 2})); + output->Resize(common::make_ddim({n, h, w, 2})); dev_ctx.template Alloc(output); phi::funcs::SetConstant()(dev_ctx, output, static_cast(0)); DenseTensor grid; @@ -89,7 +89,7 @@ void AffineGrid5DKernel(const Context& dev_ctx, d = static_cast(size_attr[2]); h = static_cast(size_attr[3]); w = static_cast(size_attr[4]); - output->Resize(phi::make_ddim({n, d, h, w, 3})); + output->Resize(common::make_ddim({n, d, h, w, 3})); dev_ctx.template Alloc(output); phi::funcs::SetConstant()(dev_ctx, output, static_cast(0)); DenseTensor grid; diff --git a/paddle/phi/kernels/cpu/arange_kernel.cc b/paddle/phi/kernels/cpu/arange_kernel.cc index b2684b2f6159af..4120e49c6af2fd 100644 --- a/paddle/phi/kernels/cpu/arange_kernel.cc +++ b/paddle/phi/kernels/cpu/arange_kernel.cc @@ -28,7 +28,7 @@ void ArangeFunc(const Context& dev_ctx, DenseTensor* out) { int64_t size = 0; phi::funcs::GetSize(start_value, end_value, step_value, &size); - out->Resize(phi::make_ddim({size})); + out->Resize(common::make_ddim({size})); T* out_data = dev_ctx.template 
Alloc(out); T value = start_value; for (int64_t i = 0; i < size; ++i) { diff --git a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc index ce00926101f2cc..351701c97f675a 100644 --- a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/arg_min_max_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -96,8 +96,8 @@ struct VisitDataArgMinMaxFunctor { int new_axis = axis; if (flatten) { // always reduce 1D -> 0D - x_dims = phi::make_ddim({x.numel()}); - out_dims = phi::make_ddim({}); + x_dims = common::make_ddim({x.numel()}); + out_dims = common::make_ddim({}); new_axis = 0; } else { x_dims = x.dims(); diff --git a/paddle/phi/kernels/cpu/argsort_grad_kernel.cc b/paddle/phi/kernels/cpu/argsort_grad_kernel.cc index 9958a23254f027..92135f1eb02346 100644 --- a/paddle/phi/kernels/cpu/argsort_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/argsort_grad_kernel.cc @@ -74,7 +74,7 @@ void ArgsortGradKernel(const Context& dev_ctx, // Do full assign if (axis == -1 || axis + 1 == in_dims.size()) { const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; FullAssign(input_height, @@ -108,8 +108,8 @@ void ArgsortGradKernel(const Context& dev_ctx, TransposeKernel(dev_ctx, out_grad, trans, &trans_dO); TransposeKernel(dev_ctx, indices, trans, &trans_ind); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const int64_t input_width = trans_dims[trans_dims.size() - 1]; DenseTensor tmp_out; diff --git a/paddle/phi/kernels/cpu/argsort_kernel.cc b/paddle/phi/kernels/cpu/argsort_kernel.cc index ba78865d40acc9..cfca255e947948 100644 --- a/paddle/phi/kernels/cpu/argsort_kernel.cc +++ b/paddle/phi/kernels/cpu/argsort_kernel.cc @@ -91,7 +91,7 @@ void ArgsortKernel(const Context& dev_ctx, // Do full sort if (axis == -1 || axis + 1 == in_dims.size()) { const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; int64_t* ids_data = dev_ctx.template Alloc(indices); FullSort(input_height, @@ -123,8 +123,8 @@ void ArgsortKernel(const Context& dev_ctx, // Do transpose TransposeKernel(dev_ctx, input, trans, &trans_inp); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const int64_t input_width = trans_dims[trans_dims.size() - 1]; DenseTensor tmp_out; diff --git a/paddle/phi/kernels/cpu/assign_pos_kernel.cc b/paddle/phi/kernels/cpu/assign_pos_kernel.cc index ceab18c5ecc7b4..7bad2262dad685 100644 --- a/paddle/phi/kernels/cpu/assign_pos_kernel.cc +++ b/paddle/phi/kernels/cpu/assign_pos_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. 
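
An aside on the pattern the argsort hunks above repeat: an N-D tensor is viewed as a [height, width] matrix by collapsing every leading dimension with product(slice_ddim(dims, 0, rank - 1)) and keeping the last extent as the width. The sketch below models only that arithmetic; Dims, SliceDims, and Product are hand-rolled stand-ins for illustration, not the paddle/common API.

    // Stand-alone model of the slice_ddim/product idiom used by the
    // argsort kernels above; compiles with any C++14 compiler.
    #include <cassert>
    #include <cstdint>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Model a DDim as a plain vector of extents.
    using Dims = std::vector<int64_t>;

    // Keep extents in [begin, end), like common::slice_ddim(dims, begin, end).
    Dims SliceDims(const Dims& dims, size_t begin, size_t end) {
      return Dims(dims.begin() + begin, dims.begin() + end);
    }

    // Multiply all extents together, like common::product(dims).
    int64_t Product(const Dims& dims) {
      return std::accumulate(dims.begin(), dims.end(), int64_t{1},
                             std::multiplies<int64_t>());
    }

    int main() {
      // A [2, 3, 5] tensor sorted along its last axis is processed as a
      // matrix of input_height rows and input_width columns.
      Dims in_dims = {2, 3, 5};
      int64_t input_height = Product(SliceDims(in_dims, 0, in_dims.size() - 1));
      int64_t input_width = in_dims[in_dims.size() - 1];
      assert(input_height == 6 && input_width == 5);
      return 0;
    }

Each of the input_height rows is then sorted independently along the contiguous last axis, which is why the kernels transpose any other target axis to the back first.
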
#include "paddle/phi/kernels/assign_pos_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index 7dc8f39da05132..23296fd352d15a 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -58,7 +58,7 @@ void BatchNormGradFunctor(const Context& ctx, DenseTensor* bias_grad) { const auto* d_y = &y_grad; - DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + DataLayout data_layout = common::StringToDataLayout(data_layout_str); auto* d_x = x_grad; auto* d_scale = scale_grad; @@ -381,7 +381,7 @@ void BatchNormDoubleGradKernel( "you want to use global status in pre_train model, " "please set `use_global_stats = True`")); - const auto data_layout = phi::StringToDataLayout(data_layout_str); + const auto data_layout = common::StringToDataLayout(data_layout_str); const auto* ddX = x_grad_grad.get_ptr(); const auto* ddScale = scale_grad_grad.get_ptr(); diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc index e6acb16a89185a..b0ee0b52cdd1c3 100644 --- a/paddle/phi/kernels/cpu/batch_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc @@ -55,7 +55,7 @@ void BatchNormKernel(const Context& ctx, bool global_stats = test_mode || use_global_stats; - auto data_layout = phi::StringToDataLayout(data_layout_str); + auto data_layout = common::StringToDataLayout(data_layout_str); const auto& x_dims = x.dims(); PADDLE_ENFORCE_GE( diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index fab30d620c10f6..4c49df6a2c7966 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -84,8 +84,8 @@ void ConcatKernel(const Context& dev_ctx, if (in->numel() == 0UL) { continue; } - auto in_stride = phi::stride_numel(in->dims()); - auto out_stride = phi::stride_numel(out->dims()); + auto in_stride = common::stride_numel(in->dims()); + auto out_stride = common::stride_numel(out->dims()); phi::funcs::StridedNumelCopyWithAxis( dev_ctx, axis, diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h index f051002d367008..8c8ba34d784622 100644 --- a/paddle/phi/kernels/cpu/conv_util.h +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/meta_tensor.h" namespace phi { @@ -27,7 +27,7 @@ inline void UpdatePaddingAndDilation(std::vector* paddings, const std::vector& strides, const std::vector& ksize) { // set padding size == data_dims.size() * 2 - auto data_shape = vectorize(data_dims); + auto data_shape = common::vectorize(data_dims); if (static_cast(paddings->size()) == data_dims.size()) { for (int i = 0; i < data_dims.size(); ++i) { T copy_pad = *(paddings->begin() + 2 * i); @@ -43,7 +43,7 @@ inline void UpdatePaddingAndDilation(std::vector* paddings, "But received: padding's size is %d, padding is [%s]; input's " "dimension is %d, input's shape is [%s].", paddings->size(), - make_ddim(*paddings), + common::make_ddim(*paddings), data_dims.size(), data_dims)); } @@ -173,7 +173,7 @@ inline std::vector ComputeOutputShape( in_dims.size(), in_dims, strides.size(), - phi::make_ddim(strides), + common::make_ddim(strides), in_dims.size() - stride_size)); const auto input_channels = 
@@ -218,19 +218,20 @@ inline std::vector ComputeOutputShape( phi::DDim in_data_dims; if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + in_data_dims = common::slice_ddim(in_dims, 1, in_dims.size() - 1); } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + in_data_dims = common::slice_ddim(in_dims, 2, in_dims.size()); } phi::DDim filter_data_dims; if (channel_last) { - filter_data_dims = phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); + filter_data_dims = + common::slice_ddim(filter_dims, 1, filter_dims.size() - 1); } else { - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); + filter_data_dims = common::slice_ddim(filter_dims, 2, filter_dims.size()); } - std::vector ksize = phi::vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); std::vector paddings_vec = paddings; std::vector dilations_vec = dilations; phi::UpdatePaddingAndDilation(&paddings_vec, diff --git a/paddle/phi/kernels/cpu/cum_maxmin_kernel.cc b/paddle/phi/kernels/cpu/cum_maxmin_kernel.cc index 881664601b85c4..72683b003685b1 100644 --- a/paddle/phi/kernels/cpu/cum_maxmin_kernel.cc +++ b/paddle/phi/kernels/cpu/cum_maxmin_kernel.cc @@ -59,7 +59,7 @@ void ComputeImp(const DenseTensor& x, int64_t x_stride = compute_stride(axis, x.dims()); int64_t values_stride = compute_stride(axis, out->dims()); int64_t indices_stride = compute_stride(axis, indices->dims()); - auto x_dim_vec = phi::vectorize(x.dims()); + auto x_dim_vec = common::vectorize(x.dims()); int x_dim_size = x_dim_vec[axis]; BinaryFunction op; diff --git a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc index a2cc99c59fe2d8..cd4e90d2c7918f 100644 --- a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/cumprod_grad_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/cumprod.h" diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc index d8383b45beb799..dcca60f97b30c9 100644 --- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc @@ -30,11 +30,11 @@ void DiagonalGradKernel(const Context& dev_ctx, DenseTensor* in_grad) { const auto* dout = &out_grad; const T* dout_data = dout->data(); - auto dout_dim = vectorize(dout->dims()); + auto dout_dim = common::vectorize(dout->dims()); auto* dx = in_grad; T* dx_data = dev_ctx.template Alloc(dx); - auto dx_dim = vectorize(dx->dims()); + auto dx_dim = common::vectorize(dx->dims()); auto dx_dim_size = dx_dim.size(); const int64_t offset_ = offset; diff --git a/paddle/phi/kernels/cpu/diagonal_kernel.cc b/paddle/phi/kernels/cpu/diagonal_kernel.cc index 58d542a1b7d328..cd71c6f06fd0af 100644 --- a/paddle/phi/kernels/cpu/diagonal_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_kernel.cc @@ -29,12 +29,12 @@ void DiagonalKernel(const Context& dev_ctx, DenseTensor* out) { auto* input = &x; const T* input_data = input->data(); - auto input_dim = vectorize(input->dims()); + auto input_dim = common::vectorize(input->dims()); auto input_dim_size = input_dim.size(); auto* output = out; T* output_data = dev_ctx.template Alloc(output); - auto 
output_dim = vectorize(output->dims()); + auto output_dim = common::vectorize(output->dims()); auto output_dim_size = output_dim.size(); const int64_t offset_ = offset; diff --git a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc index 445e92716a899d..9a48fb3994adb4 100644 --- a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc @@ -46,7 +46,7 @@ void DropoutNdGradKernel(const Context& dev_ctx, dX.device(place) = dY * static_cast(1.0f - prob); } } else { - std::vector out_dims = phi::vectorize(out_grad.dims()); + std::vector out_dims = common::vectorize(out_grad.dims()); auto M = EigenVector::Flatten(mask); if (dropout_implementation == "upscale_in_train") { if (prob == 1.0f) { diff --git a/paddle/phi/kernels/cpu/dropout_kernel.cc b/paddle/phi/kernels/cpu/dropout_kernel.cc index 79d805f62d2029..322ce0110d2bc0 100644 --- a/paddle/phi/kernels/cpu/dropout_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_kernel.cc @@ -65,7 +65,7 @@ void DropoutRawKernel(const Context& dev_ctx, bool upscale_in_train = (dropout_implementation == "upscale_in_train"); if (!is_test && mask) { auto* mask_data = dev_ctx.template Alloc(mask); - size_t size = phi::product(mask->dims()); + size_t size = common::product(mask->dims()); // Special case when dropout_prob is 1.0 if (dropout_prob == 1.0f) { @@ -135,7 +135,7 @@ void DropoutNdKernel(const Context& dev_ctx, t_mask.Resize(mask->dims()); T* t_mask_data = dev_ctx.template Alloc(&t_mask); auto* mask_data = dev_ctx.template Alloc(mask); - size_t size = phi::product(mask->dims()); + size_t size = common::product(mask->dims()); // Special case when dropout_prob is 1.0 if (dropout_prob == 1.0f) { diff --git a/paddle/phi/kernels/cpu/eig.h b/paddle/phi/kernels/cpu/eig.h index 3ec862c1d471b2..e23b27598c46d2 100644 --- a/paddle/phi/kernels/cpu/eig.h +++ b/paddle/phi/kernels/cpu/eig.h @@ -149,7 +149,7 @@ void LapackEig(DenseTensor* input, DenseTensor rwork; phi::dtype::Real* rwork_data = nullptr; - rwork.Resize(phi::make_ddim({lda * 2})); + rwork.Resize(common::make_ddim({lda * 2})); rwork_data = dev_ctx.template Alloc>(&rwork); // call lapackEig once to compute the size of work; @@ -172,7 +172,7 @@ void LapackEig(DenseTensor* input, lwork = std::max( 1, static_cast(phi::dtype::Real(computed_work_size))); DenseTensor work; - work.Resize(phi::make_ddim({lwork})); + work.Resize(common::make_ddim({lwork})); T* work_data = dev_ctx.template Alloc(&work); for (auto i = 0; i < batch_count; ++i) { @@ -217,8 +217,8 @@ void ApplyEigKernel(const DenseTensor& input, DenseTensor vectors_row_major; int num_dims = input.dims().size(); - // transfer to column-major memory layout i.e. make_ddim from tranposed_input: - // [batch,row,col]->[batch,col,row] + // transfer to column-major memory layout i.e. 
common::make_ddim from + // tranposed_input: [batch,row,col]->[batch,col,row] TransposeTwoAxis( input, &input_column_major, num_dims - 1, num_dims - 2, dev_ctx); // make sure 'vectors_row_major' holds memory before passed to LapackEig() diff --git a/paddle/phi/kernels/cpu/eig_kernel.cc b/paddle/phi/kernels/cpu/eig_kernel.cc index 8f5905f8f10892..f59e1abb7f0541 100644 --- a/paddle/phi/kernels/cpu/eig_kernel.cc +++ b/paddle/phi/kernels/cpu/eig_kernel.cc @@ -45,12 +45,12 @@ void EigKernel(const Context& dev_ctx, // double the size of real_w, the first half stores the real part, // the next half stores the imag part - std::vector origin_dim = phi::vectorize(out_w->dims()); + std::vector origin_dim = common::vectorize(out_w->dims()); int last_item = origin_dim.back(); origin_dim.pop_back(); origin_dim.push_back(last_item * 2); - phi::DDim big_dim = phi::make_ddim(origin_dim); + phi::DDim big_dim = common::make_ddim(origin_dim); real_w.Resize(big_dim); dev_ctx.template Alloc>(&real_w); diff --git a/paddle/phi/kernels/cpu/eigvals_kernel.cc b/paddle/phi/kernels/cpu/eigvals_kernel.cc index cd4aaca2ecf83f..f716a4de539222 100644 --- a/paddle/phi/kernels/cpu/eigvals_kernel.cc +++ b/paddle/phi/kernels/cpu/eigvals_kernel.cc @@ -78,7 +78,7 @@ typename std::enable_if::value>::type LapackEigvals( DenseTensor w; int64_t n_dim = input.dims()[1]; - w.Resize(make_ddim({n_dim << 1})); + w.Resize(common::make_ddim({n_dim << 1})); T* w_data = ctx.template Alloc(&w); int64_t work_mem = static_cast(work->memory_size()); @@ -190,9 +190,9 @@ void SpiltBatchSquareMatrix(const DenseTensor& input, DDim flattened_input_dims, flattened_output_dims; if (input_dims.size() > 2) { flattened_input_dims = - phi::flatten_to_3d(input_dims, last_dim - 1, last_dim); + common::flatten_to_3d(input_dims, last_dim - 1, last_dim); } else { - flattened_input_dims = phi::make_ddim({1, n_dim, n_dim}); + flattened_input_dims = common::make_ddim({1, n_dim, n_dim}); } DenseTensor flattened_input; @@ -211,7 +211,7 @@ void EigvalsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { int64_t n_dim = x_matrices[0].dims()[1]; int64_t n_batch = static_cast(x_matrices.size()); DDim out_dims = out->dims(); - out->Resize(make_ddim({n_batch, n_dim})); + out->Resize(common::make_ddim({n_batch, n_dim})); std::vector out_vectors = out->Split(1, 0); // query workspace size @@ -235,11 +235,11 @@ void EigvalsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { DenseTensor work, rwork; - work.Resize(make_ddim({lwork})); + work.Resize(common::make_ddim({lwork})); ctx.template Alloc(&work); if (IsComplexType(x.dtype())) { - rwork.Resize(make_ddim({n_dim << 1})); + rwork.Resize(common::make_ddim({n_dim << 1})); ctx.template Alloc>(&rwork); } diff --git a/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc b/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc index fa4d1ae7a710e5..c612b7a6a3f5d5 100644 --- a/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc +++ b/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc @@ -86,7 +86,7 @@ void FillDiagonalTensorKernel(const Context &ctx, phi::Copy(ctx, x, ctx.GetPlace(), false, out); auto out_dims = out->dims(); auto matdims = y.dims(); - auto fill_dims = phi::flatten_to_2d(matdims, matdims.size() - 1); + auto fill_dims = common::flatten_to_2d(matdims, matdims.size() - 1); std::array new_dims; std::array strides; diff --git a/paddle/phi/kernels/cpu/flip_kernel.cc b/paddle/phi/kernels/cpu/flip_kernel.cc index f83967073e293e..d53ffaa3df439b 100644 --- 
a/paddle/phi/kernels/cpu/flip_kernel.cc +++ b/paddle/phi/kernels/cpu/flip_kernel.cc @@ -38,7 +38,7 @@ void FlipKernel(const Context& dev_ctx, } dim_bitset[dim] = true; } - auto x_strides = phi::stride(x_dims); + auto x_strides = common::stride(x_dims); auto numel = x.numel(); const T* x_data = x.data(); T* out_data = dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index e4ba06778817c0..b1a6ceda3647d5 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -35,7 +35,7 @@ void FullKernel(const Context& dev_ctx, const Scalar& val, DataType dtype UNUSED, DenseTensor* out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); FullValue(dev_ctx, out, val.to()); } @@ -91,7 +91,7 @@ void FullIntArrayKernel(const Context& dev_ctx, const std::vector& shape, DataType dtype UNUSED, DenseTensor* out) { - out->Resize(phi::make_ddim({static_cast(shape.size())})); + out->Resize(common::make_ddim({static_cast(shape.size())})); T* out_data = dev_ctx.template Alloc(out); for (size_t i = 0; i < shape.size(); ++i) { int64_t val = shape[i]; diff --git a/paddle/phi/kernels/cpu/gaussian_kernel.cc b/paddle/phi/kernels/cpu/gaussian_kernel.cc index 00ed6aaf357409..8915f721a6911b 100644 --- a/paddle/phi/kernels/cpu/gaussian_kernel.cc +++ b/paddle/phi/kernels/cpu/gaussian_kernel.cc @@ -32,7 +32,7 @@ void GaussianKernel(const Context& dev_ctx, std::normal_distribution dist(mean, std); - tensor->Resize(phi::make_ddim(shape.GetData())); + tensor->Resize(common::make_ddim(shape.GetData())); int64_t size = tensor->numel(); T* data = dev_ctx.template Alloc(tensor); std::shared_ptr engine; diff --git a/paddle/phi/kernels/cpu/generate_proposals_kernel.cc b/paddle/phi/kernels/cpu/generate_proposals_kernel.cc index e9764035613ed3..3f3398ae59496c 100644 --- a/paddle/phi/kernels/cpu/generate_proposals_kernel.cc +++ b/paddle/phi/kernels/cpu/generate_proposals_kernel.cc @@ -73,7 +73,7 @@ void FilterBoxes(const phi::CPUContext& ctx, bool pixel_offset = true) { const T* im_info_data = im_info.data(); const T* boxes_data = boxes->data(); - keep->Resize(phi::make_ddim({boxes->dims()[0]})); + keep->Resize(common::make_ddim({boxes->dims()[0]})); min_size = std::max(min_size, 1.0f); int* keep_data = ctx.template Alloc(keep); T offset = pixel_offset ? 
static_cast(1.0) : 0; @@ -101,7 +101,7 @@ void FilterBoxes(const phi::CPUContext& ctx, } } } - keep->Resize(phi::make_ddim({keep_len})); + keep->Resize(common::make_ddim({keep_len})); } template @@ -189,7 +189,7 @@ std::pair ProposalForOneImage( // Sort index DenseTensor index_t; - index_t.Resize(phi::make_ddim({scores_slice.numel()})); + index_t.Resize(common::make_ddim({scores_slice.numel()})); int* index = ctx.template Alloc(&index_t); for (int i = 0; i < scores_slice.numel(); ++i) { index[i] = i; @@ -203,20 +203,20 @@ std::pair ProposalForOneImage( } else { std::nth_element( index, index + pre_nms_top_n, index + scores_slice.numel(), compare); - index_t.Resize(phi::make_ddim({pre_nms_top_n})); + index_t.Resize(common::make_ddim({pre_nms_top_n})); } DenseTensor scores_sel, bbox_sel, anchor_sel, var_sel; - scores_sel.Resize(phi::make_ddim({index_t.numel(), 1})); + scores_sel.Resize(common::make_ddim({index_t.numel(), 1})); ctx.template Alloc(&scores_sel); - bbox_sel.Resize(phi::make_ddim({index_t.numel(), 4})); + bbox_sel.Resize(common::make_ddim({index_t.numel(), 4})); ctx.template Alloc(&bbox_sel); - anchor_sel.Resize(phi::make_ddim({index_t.numel(), 4})); + anchor_sel.Resize(common::make_ddim({index_t.numel(), 4})); ctx.template Alloc(&anchor_sel); - var_sel.Resize(phi::make_ddim({index_t.numel(), 4})); + var_sel.Resize(common::make_ddim({index_t.numel(), 4})); ctx.template Alloc(&var_sel); phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); @@ -225,7 +225,7 @@ std::pair ProposalForOneImage( phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); DenseTensor proposals; - proposals.Resize(phi::make_ddim({index_t.numel(), 4})); + proposals.Resize(common::make_ddim({index_t.numel(), 4})); ctx.template Alloc(&proposals); BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals, pixel_offset); @@ -239,20 +239,20 @@ std::pair ProposalForOneImage( // Handle the case when there is no keep index left if (keep.numel() == 0) { phi::funcs::SetConstant set_zero; - bbox_sel.Resize(phi::make_ddim({1, 4})); + bbox_sel.Resize(common::make_ddim({1, 4})); ctx.template Alloc(&bbox_sel); set_zero(ctx, &bbox_sel, static_cast(0)); DenseTensor scores_filter; - scores_filter.Resize(phi::make_ddim({1, 1})); + scores_filter.Resize(common::make_ddim({1, 1})); ctx.template Alloc(&scores_filter); set_zero(ctx, &scores_filter, static_cast(0)); return std::make_pair(bbox_sel, scores_filter); } DenseTensor scores_filter; - bbox_sel.Resize(phi::make_ddim({keep.numel(), 4})); + bbox_sel.Resize(common::make_ddim({keep.numel(), 4})); ctx.template Alloc(&bbox_sel); - scores_filter.Resize(phi::make_ddim({keep.numel(), 1})); + scores_filter.Resize(common::make_ddim({keep.numel(), 1})); ctx.template Alloc(&scores_filter); phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); @@ -264,12 +264,12 @@ std::pair ProposalForOneImage( ctx, &bbox_sel, &scores_filter, nms_thresh, eta, pixel_offset); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { - keep_nms.Resize(phi::make_ddim({post_nms_top_n})); + keep_nms.Resize(common::make_ddim({post_nms_top_n})); } - proposals.Resize(phi::make_ddim({keep_nms.numel(), 4})); + proposals.Resize(common::make_ddim({keep_nms.numel(), 4})); ctx.template Alloc(&proposals); - scores_sel.Resize(phi::make_ddim({keep_nms.numel(), 1})); + scores_sel.Resize(common::make_ddim({keep_nms.numel(), 1})); ctx.template Alloc(&scores_sel); phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); 
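
For context, nearly every hunk in this file is the same mechanical substitution: the DDim helpers keep their names, signatures, and semantics and only move from namespace phi to namespace common. A minimal sketch of the post-patch spelling, assuming a build inside a Paddle tree that already contains paddle/common/ddim.h from this change (the assert-based round-trip is illustrative, not a Paddle test):

    #include <cassert>
    #include <cstdint>
    #include <vector>
    #include "paddle/common/ddim.h"

    int main() {
      // What used to be phi::make_ddim is now common::make_ddim; the
      // resulting DDim and its helpers behave the same.
      common::DDim dims = common::make_ddim({8, 4, 4});
      assert(common::product(dims) == 8 * 4 * 4);
      // vectorize and make_ddim remain inverses of each other, which is
      // what the edit-extents-then-rebuild hunks above rely on.
      std::vector<int64_t> extents = common::vectorize(dims);
      assert(common::make_ddim(extents) == dims);
      return 0;
    }

This is also why the hunks that rewrap lines (rather than just renaming) exist: the longer common:: prefix pushes some calls past the formatter's column limit.
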
  phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel);
@@ -304,17 +304,17 @@ void GenerateProposalsKernel(const Context& ctx,
   int64_t h_bbox = bbox_dim[2];
   int64_t w_bbox = bbox_dim[3];
-  rpn_rois->Resize(phi::make_ddim({bbox_deltas.numel() / 4, 4}));
+  rpn_rois->Resize(common::make_ddim({bbox_deltas.numel() / 4, 4}));
   ctx.template Alloc(rpn_rois);
-  rpn_roi_probs->Resize(phi::make_ddim({scores.numel(), 1}));
+  rpn_roi_probs->Resize(common::make_ddim({scores.numel(), 1}));
   ctx.template Alloc(rpn_roi_probs);
   DenseTensor bbox_deltas_swap, scores_swap;
-  bbox_deltas_swap.Resize(phi::make_ddim({num, h_bbox, w_bbox, c_bbox}));
+  bbox_deltas_swap.Resize(common::make_ddim({num, h_bbox, w_bbox, c_bbox}));
   ctx.template Alloc(&bbox_deltas_swap);
-  scores_swap.Resize(phi::make_ddim({num, h_score, w_score, c_score}));
+  scores_swap.Resize(common::make_ddim({num, h_score, w_score, c_score}));
   ctx.template Alloc(&scores_swap);
   phi::funcs::Transpose trans;
@@ -328,8 +328,8 @@ void GenerateProposalsKernel(const Context& ctx,
   lod0.push_back(0);
   DenseTensor tmp_anchors = anchors;
   DenseTensor tmp_variances = variances;
-  tmp_anchors.Resize(phi::make_ddim({tmp_anchors.numel() / 4, 4}));
-  tmp_variances.Resize(phi::make_ddim({tmp_variances.numel() / 4, 4}));
+  tmp_anchors.Resize(common::make_ddim({tmp_anchors.numel() / 4, 4}));
+  tmp_variances.Resize(common::make_ddim({tmp_variances.numel() / 4, 4}));
   std::vector tmp_num;
   int64_t num_proposals = 0;
@@ -338,8 +338,9 @@ void GenerateProposalsKernel(const Context& ctx,
     DenseTensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
     DenseTensor scores_slice = scores_swap.Slice(i, i + 1);
-    bbox_deltas_slice.Resize(phi::make_ddim({h_bbox * w_bbox * c_bbox / 4, 4}));
-    scores_slice.Resize(phi::make_ddim({h_score * w_score * c_score, 1}));
+    bbox_deltas_slice.Resize(
+        common::make_ddim({h_bbox * w_bbox * c_bbox / 4, 4}));
+    scores_slice.Resize(common::make_ddim({h_score * w_score * c_score, 1}));
     std::pair tensor_pair =
         ProposalForOneImage(ctx,
@@ -364,16 +365,16 @@ void GenerateProposalsKernel(const Context& ctx,
     tmp_num.push_back(static_cast(proposals.dims()[0]));
   }
   if (rpn_rois_num != nullptr) {
-    rpn_rois_num->Resize(phi::make_ddim({num}));
+    rpn_rois_num->Resize(common::make_ddim({num}));
     ctx.template Alloc(rpn_rois_num);
     int* num_data = rpn_rois_num->data();
     for (int i = 0; i < num; i++) {
       num_data[i] = tmp_num[i];
     }
-    rpn_rois_num->Resize(phi::make_ddim({num}));
+    rpn_rois_num->Resize(common::make_ddim({num}));
   }
-  rpn_rois->Resize(phi::make_ddim({num_proposals, 4}));
-  rpn_roi_probs->Resize(phi::make_ddim({num_proposals, 1}));
+  rpn_rois->Resize(common::make_ddim({num_proposals, 4}));
+  rpn_roi_probs->Resize(common::make_ddim({num_proposals, 1}));
 }
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/grid_sample_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_kernel.cc
index 172ca16d0deb87..1d8a8536e01dbc 100644
--- a/paddle/phi/kernels/cpu/grid_sample_kernel.cc
+++ b/paddle/phi/kernels/cpu/grid_sample_kernel.cc
@@ -320,7 +320,7 @@ void GridSampleKernel(const Context& dev_ctx,
     const int in_h = static_cast(x.dims()[2]);
     const int in_w = static_cast(x.dims()[3]);
-    out->Resize(phi::make_ddim({n, c, out_h, out_w}));
+    out->Resize(common::make_ddim({n, c, out_h, out_w}));
     dev_ctx.template Alloc(out);
     phi::funcs::SetConstant()(dev_ctx, out, static_cast(0));
@@ -353,7 +353,7 @@ void GridSampleKernel(const Context& dev_ctx,
     const int in_h = static_cast(x.dims()[3]);
     const int in_w = static_cast(x.dims()[4]);
-    out->Resize(phi::make_ddim({n, c, out_d, out_h, out_w}));
+    out->Resize(common::make_ddim({n, c, out_d, out_h, out_w}));
     dev_ctx.template Alloc(out);
     phi::funcs::SetConstant()(dev_ctx, out, static_cast(0));
diff --git a/paddle/phi/kernels/cpu/group_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/group_norm_grad_kernel.cc
index a4c123f2f94ff1..75d7a164a99240 100644
--- a/paddle/phi/kernels/cpu/group_norm_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/group_norm_grad_kernel.cc
@@ -19,8 +19,8 @@
 #include
 #include
+#include "paddle/common/layout.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
@@ -44,7 +44,7 @@ void GroupNormGradKernel(const Context& dev_ctx,
                          DenseTensor* d_x,
                          DenseTensor* d_scale,
                          DenseTensor* d_bias) {
-  const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
+  const DataLayout data_layout = common::StringToDataLayout(data_layout_str);
   const auto scale_ptr = scale.get_ptr();
   const auto bias_ptr = bias.get_ptr();
   const auto& x_dims = y.dims();
diff --git a/paddle/phi/kernels/cpu/group_norm_kernel.cc b/paddle/phi/kernels/cpu/group_norm_kernel.cc
index 35975018dca1cc..499e47949c32e8 100644
--- a/paddle/phi/kernels/cpu/group_norm_kernel.cc
+++ b/paddle/phi/kernels/cpu/group_norm_kernel.cc
@@ -19,8 +19,8 @@
 #include
 #include
+#include "paddle/common/layout.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
@@ -40,7 +40,7 @@ void GroupNormKernel(const Context& dev_ctx,
                      DenseTensor* y,
                      DenseTensor* mean,
                      DenseTensor* var) {
-  const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
+  const DataLayout data_layout = common::StringToDataLayout(data_layout_str);
   const auto scale_ptr = scale.get_ptr();
   const auto bias_ptr = bias.get_ptr();
diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc
index 12d4a668be33f6..94ef3231c70101 100644
--- a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc
+++ b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc
@@ -34,7 +34,7 @@ struct GumbleNoiseGenerator {
     std::uniform_real_distribution dist(0.00001, 1);
     auto engine = ctx.GetGenerator()->GetCPUEngine();
     DenseTensor random_tensor;
-    random_tensor.Resize(make_ddim({size}));
+    random_tensor.Resize(common::make_ddim({size}));
     auto* random_data = ctx.template Alloc(&random_tensor);
     for (int64_t i = 0; i < size; ++i) {
       random_data[i] = dist(*engine);
diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc b/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc
index 3b43c2e8c2e5c7..cee94ff3fd734f 100644
--- a/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc
+++ b/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc
@@ -51,7 +51,7 @@ void HSigmoidLossKernel(const Context& ctx,
           : static_cast(phi::funcs::FindLastSet(num_classes_st - 1));
   int64_t batch_size = x.dims()[0];
   DenseTensor sum;
-  pre_out->Resize(phi::make_ddim({batch_size, code_length}));
+  pre_out->Resize(common::make_ddim({batch_size, code_length}));
   ctx.template Alloc(pre_out);
   auto* pre_out_data = pre_out->data();
   auto pre_out_mat = EigenMatrix::From(*pre_out);
@@ -72,7 +72,7 @@ void HSigmoidLossKernel(const Context& ctx,
   }
   std::vector sum_dims({batch_size, 1UL});
-  sum.Resize(phi::make_ddim(sum_dims));
+  sum.Resize(common::make_ddim(sum_dims));
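
The group_norm hunks above also pick up common::StringToDataLayout. Its job is small: turn the kernel's data_layout attribute string into an enum once, so the hot loops branch on an integer instead of comparing strings. Below is a stand-alone model of that behavior; Layout and ParseLayout are hand-rolled stand-ins for illustration, not the declarations in paddle/common/layout.h.

    #include <cassert>
    #include <cstdint>
    #include <stdexcept>
    #include <string>

    enum class Layout { kNCHW, kNHWC };

    // Map the attribute string onto the enum, rejecting anything else,
    // roughly what StringToDataLayout does for these kernels.
    Layout ParseLayout(const std::string& s) {
      if (s == "NCHW") return Layout::kNCHW;
      if (s == "NHWC") return Layout::kNHWC;
      throw std::invalid_argument("unsupported data layout: " + s);
    }

    int main() {
      // The parsed layout decides which extent is the channel count:
      // dims[1] for NCHW, dims[rank - 1] for NHWC.
      int64_t dims[4] = {8, 3, 32, 32};  // an NCHW example shape
      Layout layout = ParseLayout("NCHW");
      int64_t channels = (layout == Layout::kNCHW) ? dims[1] : dims[3];
      assert(channels == 3);
      return 0;
    }
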
ctx.template Alloc(&sum); auto sum_mat = EigenMatrix::From(sum); ctx.template Alloc(out); diff --git a/paddle/phi/kernels/cpu/index_add_impl.h b/paddle/phi/kernels/cpu/index_add_impl.h index 0a0671951b357a..d16cbc382215dc 100644 --- a/paddle/phi/kernels/cpu/index_add_impl.h +++ b/paddle/phi/kernels/cpu/index_add_impl.h @@ -77,8 +77,8 @@ void IndexAddInner(const Context& ctx, VLOG(3) << "Index_Add_Debug; outer_nums: " << outer_nums << "; slice_size: " << slice_size << "; index_size: " << index_size; - output->Resize(phi::make_ddim({outer_nums, input_dim[axis], slice_size})); - add_value->Resize(phi::make_ddim({outer_nums, index_size, slice_size})); + output->Resize(common::make_ddim({outer_nums, input_dim[axis], slice_size})); + add_value->Resize(common::make_ddim({outer_nums, index_size, slice_size})); VLOG(3) << "output.dims: " << output->dims() << ", add_value.dims: " << add_value->dims(); diff --git a/paddle/phi/kernels/cpu/index_put_grad_kernel.cc b/paddle/phi/kernels/cpu/index_put_grad_kernel.cc index 7385a928c17916..8a100af33f0184 100644 --- a/paddle/phi/kernels/cpu/index_put_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/index_put_grad_kernel.cc @@ -91,7 +91,7 @@ void LaunchIndexPutGradKernel(const Context& dev_ctx, auto x_grad_dims = x_grad->dims(); const int64_t numel = indices[0]->numel(); - auto x_grad_stride = phi::stride(x_grad_dims); + auto x_grad_stride = common::stride(x_grad_dims); set_zero_kernel( numel, pd_indices.data(), x_grad_stride, x_grad_dims, x_grad_data); @@ -100,7 +100,7 @@ void LaunchIndexPutGradKernel(const Context& dev_ctx, auto out_grad_dims = out_grad.dims(); const int64_t numel = indices[0]->numel(); - auto out_grad_stride = phi::stride(out_grad_dims); + auto out_grad_stride = common::stride(out_grad_dims); if (value_grad) { if (value_grad->numel() == 1) { @@ -150,8 +150,9 @@ void LaunchIndexPutGradKernel(const Context& dev_ctx, out_grad_dims, tmp_value_grad_data); - std::vector after_dims = phi::vectorize(tmp_value_grad.dims()); - std::vector before_dims = phi::vectorize(value_grad->dims()); + std::vector after_dims = + common::vectorize(tmp_value_grad.dims()); + std::vector before_dims = common::vectorize(value_grad->dims()); std::vector compress_dims; std::vector dims_without_1; @@ -159,7 +160,7 @@ void LaunchIndexPutGradKernel(const Context& dev_ctx, &after_dims, &before_dims, &compress_dims, &dims_without_1); auto pre_dims = value_grad->dims(); - value_grad->Resize(phi::make_ddim(dims_without_1)); + value_grad->Resize(common::make_ddim(dims_without_1)); IntArray v_axis(compress_dims); SumKernel(dev_ctx, tmp_value_grad, @@ -196,7 +197,7 @@ void IndexPutGradKernel(const Context& dev_ctx, } if (value_grad) { FullKernel(dev_ctx, - phi::vectorize(value_grad->dims()), + common::vectorize(value_grad->dims()), 0.0f, value_grad->dtype(), value_grad); @@ -205,7 +206,7 @@ void IndexPutGradKernel(const Context& dev_ctx, } auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); - std::vector res_dim_v(phi::vectorize(bd_dim)); + std::vector res_dim_v(common::vectorize(bd_dim)); std::vector res_indices_v(x.dims().size(), nullptr); std::vector tmp_res_indices_v; std::vector range_tensor_v; diff --git a/paddle/phi/kernels/cpu/index_put_kernel.cc b/paddle/phi/kernels/cpu/index_put_kernel.cc index f587978c2c2adf..4820dbc3087b9c 100644 --- a/paddle/phi/kernels/cpu/index_put_kernel.cc +++ b/paddle/phi/kernels/cpu/index_put_kernel.cc @@ -72,7 +72,7 @@ void LaunchIndexPutKernel(const Context& dev_ctx, auto x_dims = x.dims(); const int64_t numel = indices[0]->numel(); - auto 
x_stride = phi::stride(x_dims); + auto x_stride = common::stride(x_dims); int64_t is_single_val_tensor = (value.numel() == 1) ? 0 : INT64_MAX; @@ -127,7 +127,7 @@ void IndexPutKernel(const Context& dev_ctx, auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); - std::vector res_dim_v(phi::vectorize(bd_dim)); + std::vector res_dim_v(common::vectorize(bd_dim)); std::vector res_indices_v(x.dims().size(), nullptr); std::vector tmp_res_indices_v; std::vector tmp_value_v; @@ -150,7 +150,7 @@ void IndexPutKernel(const Context& dev_ctx, &res_dim_v); if (value.numel() != 1) { tmp_value_v.emplace_back( - DenseTensor(value.dtype()).Resize(phi::make_ddim(res_dim_v))); + DenseTensor(value.dtype()).Resize(common::make_ddim(res_dim_v))); ExpandKernel( dev_ctx, value, IntArray(res_dim_v), &tmp_value_v[0]); ptr_value = &tmp_value_v[0]; diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc index 02f3afcb67b6ef..f2aa55b9d326f2 100644 --- a/paddle/phi/kernels/cpu/index_sample_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -76,7 +76,7 @@ void IndexSampleInner(const Context &context, res[i] = v; } - auto ddim = phi::make_ddim({batch_size, index_length}); + auto ddim = common::make_ddim({batch_size, index_length}); context.template Alloc(output); phi::TensorFromVector(res, context, output); output->Resize(ddim); diff --git a/paddle/phi/kernels/cpu/index_select_impl.h b/paddle/phi/kernels/cpu/index_select_impl.h index 7c20ab8ea5dfd2..522719e56fcfdc 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -106,8 +106,8 @@ void IndexSelectInner(const Context& ctx, VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums << "; slice_size: " << slice_size << "; index_size: " << index_size; - input->Resize(phi::make_ddim({outer_nums, input_dim[dim], slice_size})); - output->Resize(phi::make_ddim({outer_nums, index_size, slice_size})); + input->Resize(common::make_ddim({outer_nums, input_dim[dim], slice_size})); + output->Resize(common::make_ddim({outer_nums, index_size, slice_size})); auto input_tensor = EigenTensor::From(*input); auto output_tensor = EigenTensor::From(*output); diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc index d798c6b81c9666..b53482c9d8d3fd 100644 --- a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc @@ -18,8 +18,8 @@ #include #include +#include "paddle/common/layout.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" diff --git a/paddle/phi/kernels/cpu/instance_norm_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_kernel.cc index 1242babaf0c835..56af2dc5f23403 100644 --- a/paddle/phi/kernels/cpu/instance_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_kernel.cc @@ -18,8 +18,8 @@ #include #include +#include "paddle/common/layout.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc index e32738b4588c83..79aac41a34903f 100644 --- 
a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/interpolate_grad_kernel.h" #include +#include "paddle/common/layout.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/interpolate_function.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -406,7 +406,7 @@ static void Interpolate1DCPUBwd( bool align_corners, int align_mode, DenseTensor* input_grad) { - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -507,7 +507,7 @@ static void Interpolate2DCPUBwd( bool align_corners, int align_mode, DenseTensor* input_grad) { - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -673,7 +673,7 @@ static void Interpolate3DCPUBwd( bool align_corners, int align_mode, DenseTensor* input_grad) { - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); diff --git a/paddle/phi/kernels/cpu/interpolate_kernel.cc b/paddle/phi/kernels/cpu/interpolate_kernel.cc index 7c957657ceb39e..495ecc6b3cdfd4 100644 --- a/paddle/phi/kernels/cpu/interpolate_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_kernel.cc @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/interpolate_kernel.h" #include +#include "paddle/common/layout.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/interpolate_function.h" @@ -560,7 +560,7 @@ static void Interpolate1DCPUFwd( bool align_corners, int align_mode, DenseTensor* output) { - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -661,7 +661,7 @@ static void Interpolate2DCPUFwd( bool align_corners, int align_mode, DenseTensor* output) { - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -832,7 +832,7 @@ static void Interpolate3DCPUFwd( bool align_corners, int align_mode, DenseTensor* output) { - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n = 0, c = 0, in_d = 0, in_h = 0, in_w = 0; funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); diff --git a/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc 
b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc index 5239512537b428..12c6d9fb28e06e 100644 --- a/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc @@ -73,12 +73,12 @@ void KthvalueGradKernel(const Context& dev_ctx, for (int i = axis + 1; i < in_dims.size(); i++) { tmp_out_shape.emplace_back(out_dims[i - 1]); } - out_dims = phi::make_ddim(tmp_out_shape); + out_dims = common::make_ddim(tmp_out_shape); } if (axis == in_dims.size() - 1) { const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; memset(x_grad_data, 0, d_x->numel() * sizeof(T)); if (keepdim) { @@ -147,8 +147,8 @@ void KthvalueGradKernel(const Context& dev_ctx, funcs::TransCompute( ndims, dev_ctx, indices_tmp, &trans_ind, trans); } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; DenseTensor tmp_out; tmp_out.Resize(trans_in_dims); diff --git a/paddle/phi/kernels/cpu/kthvalue_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_kernel.cc index eb8cc8f813ff9e..6e719f52b6e051 100644 --- a/paddle/phi/kernels/cpu/kthvalue_kernel.cc +++ b/paddle/phi/kernels/cpu/kthvalue_kernel.cc @@ -101,7 +101,7 @@ void KthvalueKernel(const Context& dev_ctx, auto out_dims = output->dims(); if (axis == in_dims.size() - 1) { const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t& input_width = in_dims[in_dims.size() - 1]; getKthvalue(input_height, input_width, @@ -129,7 +129,7 @@ void KthvalueKernel(const Context& dev_ctx, for (int i = axis + 1; i < in_dims.size(); i++) { tmp_out_shape.emplace_back(in_dims[i]); } - DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + DDim tmp_out_dims = common::make_ddim(tmp_out_shape); output->Resize(tmp_out_dims); indices->Resize(tmp_out_dims); } @@ -148,8 +148,8 @@ void KthvalueKernel(const Context& dev_ctx, funcs::TransCompute( ndims, dev_ctx, x, &trans_inp, trans); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const int64_t input_width = trans_dims[trans_dims.size() - 1]; DenseTensor tmp_out, tmp_indices; tmp_out.Resize(trans_out_dims); diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc index ddc63598756710..341d8ef98fd947 100644 --- a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -48,7 +48,7 @@ void LayerNormGradKernel(const Context& dev_ctx, auto* d_bias = bias_grad; const auto& x_dims = x.dims(); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); DDim matrix_shape({left, right}); diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc index b15b1554a51c43..8713d2f49e60e9 100644 --- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc +++ 
b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -46,7 +46,7 @@ void LayerNormKernel(const Context& dev_ctx, dev_ctx.template Alloc(mean); dev_ctx.template Alloc(var); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); DDim matrix_shape({left, right}); diff --git a/paddle/phi/kernels/cpu/limit_by_capacity_kernel.cc b/paddle/phi/kernels/cpu/limit_by_capacity_kernel.cc index ea2f6cbc6ee82c..1057120b2ae5e1 100644 --- a/paddle/phi/kernels/cpu/limit_by_capacity_kernel.cc +++ b/paddle/phi/kernels/cpu/limit_by_capacity_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/limit_by_capacity_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" #if defined(PADDLE_WITH_GLOO) diff --git a/paddle/phi/kernels/cpu/linspace_kernel.cc b/paddle/phi/kernels/cpu/linspace_kernel.cc index 4b8b7f7a2e05c7..70cd1e17ca9ce5 100644 --- a/paddle/phi/kernels/cpu/linspace_kernel.cc +++ b/paddle/phi/kernels/cpu/linspace_kernel.cc @@ -40,7 +40,7 @@ void LinspaceKernel(const Context& ctx, "than 0, but received num is %d", num)); - out->Resize(phi::make_ddim({num})); + out->Resize(common::make_ddim({num})); T* out_data = ctx.template Alloc(out); if (num > 1) { diff --git a/paddle/phi/kernels/cpu/logspace_kernel.cc b/paddle/phi/kernels/cpu/logspace_kernel.cc index fbb31057a35ae9..f6a31fed1e13ff 100644 --- a/paddle/phi/kernels/cpu/logspace_kernel.cc +++ b/paddle/phi/kernels/cpu/logspace_kernel.cc @@ -45,7 +45,7 @@ void LogspaceKernel(const Context& ctx, "than 0, but received num is %d", num)); - out->Resize(phi::make_ddim({num})); + out->Resize(common::make_ddim({num})); T* out_data = ctx.template Alloc(out); if (num > 1) { diff --git a/paddle/phi/kernels/cpu/lstsq_kernel.cc b/paddle/phi/kernels/cpu/lstsq_kernel.cc index 3e4782c1a9f0a9..2b81649caf904f 100644 --- a/paddle/phi/kernels/cpu/lstsq_kernel.cc +++ b/paddle/phi/kernels/cpu/lstsq_kernel.cc @@ -70,11 +70,11 @@ void LstsqKernel(const Context& dev_ctx, int ldb = std::max(1, std::max(m, n)); DenseTensor* new_x = new DenseTensor(); - new_x->Resize(phi::make_ddim({batch_count, m, n})); + new_x->Resize(common::make_ddim({batch_count, m, n})); dev_ctx.template Alloc(new_x); phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), true, new_x); - solution->Resize(phi::make_ddim({batch_count, std::max(m, n), nrhs})); + solution->Resize(common::make_ddim({batch_count, std::max(m, n), nrhs})); dev_ctx.template Alloc(solution); if (m >= n) { @@ -122,7 +122,7 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor* jpvt = new DenseTensor(); int* jpvt_data = nullptr; if (driver == LapackDriverType::Gelsy) { - jpvt->Resize(phi::make_ddim({std::max(1, n)})); + jpvt->Resize(common::make_ddim({std::max(1, n)})); jpvt_data = dev_ctx.template Alloc(jpvt); } @@ -185,7 +185,7 @@ void LstsqKernel(const Context& dev_ctx, lwork = std::max(1, static_cast(phi::dtype::Real(wkopt))); DenseTensor* work = new DenseTensor(); - work->Resize(phi::make_ddim({lwork})); + work->Resize(common::make_ddim({lwork})); T* work_data = dev_ctx.template Alloc(work); // "rwork" only used for complex inputs and "gelsy/gelsd/gelss" drivers @@ -200,7 +200,7 @@ void LstsqKernel(const Context& dev_ctx, } else if (driver == LapackDriverType::Gelsd) { rwork_len = std::max(1, rwkopt); } - rwork->Resize(phi::make_ddim({rwork_len})); + 
rwork->Resize(common::make_ddim({rwork_len})); rwork_data = dev_ctx.template Alloc(rwork); } @@ -208,7 +208,7 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor* iwork = new DenseTensor(); int* iwork_data = nullptr; if (driver == LapackDriverType::Gelsd) { - iwork->Resize(phi::make_ddim({std::max(1, iwkopt)})); + iwork->Resize(common::make_ddim({std::max(1, iwkopt)})); iwork_data = dev_ctx.template Alloc(iwork); } @@ -293,7 +293,7 @@ void LstsqKernel(const Context& dev_ctx, if (batch_count > 1) { solution->Resize(solution_dim); } else { - solution->Resize(phi::make_ddim({n, nrhs})); + solution->Resize(common::make_ddim({n, nrhs})); } GetResidualsTensor(dev_ctx, x, y, solution, residuals); diff --git a/paddle/phi/kernels/cpu/lu_kernel.cc b/paddle/phi/kernels/cpu/lu_kernel.cc index 731a722372d656..5790b24025b2e0 100644 --- a/paddle/phi/kernels/cpu/lu_kernel.cc +++ b/paddle/phi/kernels/cpu/lu_kernel.cc @@ -43,15 +43,15 @@ void LUKernel(const Context& dev_ctx, int n = static_cast(outdims[outrank - 2]); int lda = std::max(1, m); - auto ipiv_dims = phi::slice_ddim(outdims, 0, outrank - 1); + auto ipiv_dims = common::slice_ddim(outdims, 0, outrank - 1); ipiv_dims[outrank - 2] = std::min(m, n); pivots->Resize(ipiv_dims); dev_ctx.template Alloc(pivots); auto ipiv_data = pivots->data(); - auto info_dims = phi::slice_ddim(outdims, 0, outrank - 2); + auto info_dims = common::slice_ddim(outdims, 0, outrank - 2); if (info_dims.size() == 0) { - info_dims = phi::make_ddim({1}); + info_dims = common::make_ddim({1}); } infos->Resize(info_dims); dev_ctx.template Alloc(infos); diff --git a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc index 58f4f7361eb64d..fa120de4b79521 100644 --- a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc @@ -40,8 +40,8 @@ void MaskedSelectGradKernel(const Context& dev_ctx, bool expand_x = false; auto expanded_size = funcs::MatrixGetBroadcastBatchPortion( - vectorize(x_grad->dims()), vectorize(mask.dims())); - auto expaned_dims = make_ddim(expanded_size); + common::vectorize(x_grad->dims()), common::vectorize(mask.dims())); + auto expaned_dims = common::make_ddim(expanded_size); if (mask.dims() != expaned_dims) { ExpandKernel( diff --git a/paddle/phi/kernels/cpu/masked_select_kernel.cc b/paddle/phi/kernels/cpu/masked_select_kernel.cc index 837a8921e8148a..8e9e3bbebecd4d 100644 --- a/paddle/phi/kernels/cpu/masked_select_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_kernel.cc @@ -29,9 +29,9 @@ void MaskedSelectKernel(const Context& dev_ctx, DenseTensor x_expand; auto expanded_size = funcs::MatrixGetBroadcastBatchPortion( - vectorize(x.dims()), vectorize(mask.dims())); + common::vectorize(x.dims()), common::vectorize(mask.dims())); - DDim epxand_dims = make_ddim(expanded_size); + DDim epxand_dims = common::make_ddim(expanded_size); if (mask.dims() != epxand_dims) { ExpandKernel( dev_ctx, mask, IntArray(expanded_size), &mask_expand); diff --git a/paddle/phi/kernels/cpu/matrix_nms_kernel.cc b/paddle/phi/kernels/cpu/matrix_nms_kernel.cc index b2827d039bacce..fb993029bb1916 100644 --- a/paddle/phi/kernels/cpu/matrix_nms_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_nms_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/matrix_nms_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" @@ -257,7 
+257,7 @@ void MatrixNMSKernel(const Context& ctx, DenseTensor* out, DenseTensor* index, DenseTensor* roisnum) { - auto score_dims = phi::vectorize(scores.dims()); + auto score_dims = common::vectorize(scores.dims()); auto batch_size = score_dims[0]; auto num_boxes = score_dims[2]; auto box_dim = bboxes.dims()[2]; @@ -297,21 +297,21 @@ void MatrixNMSKernel(const Context& ctx, int64_t num_kept = static_cast(offsets.back()); if (num_kept == 0) { - out->Resize(phi::make_ddim({0, out_dim})); + out->Resize(common::make_ddim({0, out_dim})); ctx.template Alloc(out); - index->Resize(phi::make_ddim({0, 1})); + index->Resize(common::make_ddim({0, 1})); ctx.template Alloc(index); } else { - out->Resize(phi::make_ddim({num_kept, out_dim})); + out->Resize(common::make_ddim({num_kept, out_dim})); ctx.template Alloc(out); - index->Resize(phi::make_ddim({num_kept, 1})); + index->Resize(common::make_ddim({num_kept, 1})); ctx.template Alloc(index); std::copy(detections.begin(), detections.end(), out->data()); std::copy(indices.begin(), indices.end(), index->data()); } if (roisnum != nullptr) { - roisnum->Resize(phi::make_ddim({batch_size})); + roisnum->Resize(common::make_ddim({batch_size})); ctx.template Alloc(roisnum); std::copy(num_per_batch.begin(), num_per_batch.end(), roisnum->data()); } diff --git a/paddle/phi/kernels/cpu/mode_grad_kernel.cc b/paddle/phi/kernels/cpu/mode_grad_kernel.cc index c4268080d50c07..a0c592c4bab9a0 100644 --- a/paddle/phi/kernels/cpu/mode_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/mode_grad_kernel.cc @@ -53,14 +53,14 @@ void ModeGradKernel(const Context& dev_ctx, for (int i = axis + 1; i < in_dims.size(); i++) { tmp_out_shape.emplace_back(out_dims[i - 1]); } - out_dims = phi::make_ddim(tmp_out_shape); + out_dims = common::make_ddim(tmp_out_shape); } if (axis == in_dims.size() - 1) { // allocate the memory for the input_grad // assign the out_grad to input_grad directly const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; // init the output grad with 0, because some input elements have no grad @@ -143,8 +143,8 @@ void ModeGradKernel(const Context& dev_ctx, funcs::TransCompute( ndims, dev_ctx, indices_tmp, &trans_ind, trans_axis); } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; // Assign the out_grad to transposed input_grad diff --git a/paddle/phi/kernels/cpu/mode_kernel.cc b/paddle/phi/kernels/cpu/mode_kernel.cc index 3459fd8372d9de..f5a0a1ee05e914 100644 --- a/paddle/phi/kernels/cpu/mode_kernel.cc +++ b/paddle/phi/kernels/cpu/mode_kernel.cc @@ -52,7 +52,7 @@ void ModeKernel(const Context& dev_ctx, // calculation, then transpose it back to original axis.
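The mode and top-k style hunks in this section all migrate the same reshape idiom: collapse every leading dimension into a row count with product(slice_ddim(...)) and treat the reduction axis as the row width. The following is an editorial sketch, not part of the patch, assuming the post-move paddle/common/ddim.h API; LastAxisView is an illustrative name.

#include <cstdint>
#include "paddle/common/ddim.h"

// Flatten [d0, d1, ..., dk] into the 2-D view [d0*...*d(k-1), dk] that
// the GetMode/FullTopK loops above and below iterate over.
void LastAxisView(const common::DDim& in_dims) {
  // slice_ddim keeps dims [0, size-1); product multiplies them together.
  const int64_t input_height =
      common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1));
  const int64_t input_width = in_dims[in_dims.size() - 1];
  // For in_dims = [2, 3, 4]: input_height == 6, input_width == 4.
  (void)input_height;
  (void)input_width;
}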
if (axis == in_dims.size() - 1) { const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t& input_width = in_dims[in_dims.size() - 1]; funcs::GetMode(input_height, input_width, @@ -80,7 +80,7 @@ void ModeKernel(const Context& dev_ctx, for (int i = axis + 1; i < in_dims.size(); i++) { tmp_out_shape.emplace_back(in_dims[i]); } - DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + DDim tmp_out_dim = common::make_ddim(tmp_out_shape); out->Resize(tmp_out_dim); indices->Resize(tmp_out_dim); } @@ -104,8 +104,8 @@ void ModeKernel(const Context& dev_ctx, funcs::TransCompute( ndims, dev_ctx, x, &trans_input, trans_axis); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); const int64_t input_width = trans_shape[trans_shape.size() - 1]; DenseTensor tmp_out; tmp_out.Resize(trans_out_shape); diff --git a/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc b/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc index aa04288124a9b7..7bfc41e91d6cc5 100644 --- a/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc +++ b/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc @@ -494,7 +494,7 @@ void MultiClassNMSKernel(const Context& ctx, DenseTensor* nms_rois_num) { bool return_index = index != nullptr; bool has_roisnum = rois_num.get_ptr() != nullptr; - auto score_dims = phi::vectorize(scores.dims()); + auto score_dims = common::vectorize(scores.dims()); auto score_size = score_dims.size(); std::vector>> all_indices; diff --git a/paddle/phi/kernels/cpu/nms_kernel.cc b/paddle/phi/kernels/cpu/nms_kernel.cc index 5534fd71a18274..e733af0128d15b 100644 --- a/paddle/phi/kernels/cpu/nms_kernel.cc +++ b/paddle/phi/kernels/cpu/nms_kernel.cc @@ -84,7 +84,7 @@ void NMSKernel(const Context& dev_ctx, int64_t num_boxes = boxes.dims()[0]; DenseTensor output_tmp; - output_tmp.Resize(phi::make_ddim({num_boxes})); + output_tmp.Resize(common::make_ddim({num_boxes})); auto output_tmp_data = dev_ctx.template Alloc(&output_tmp); int64_t num_keep_boxes = diff --git a/paddle/phi/kernels/cpu/nonzero_kernel.cc b/paddle/phi/kernels/cpu/nonzero_kernel.cc index 653a03e7f99071..038244accfdef6 100644 --- a/paddle/phi/kernels/cpu/nonzero_kernel.cc +++ b/paddle/phi/kernels/cpu/nonzero_kernel.cc @@ -62,7 +62,7 @@ void NonZeroKernel(const Context& dev_ctx, } } auto true_num = true_index.size(); - out->Resize(phi::make_ddim({static_cast(true_num), rank})); + out->Resize(common::make_ddim({static_cast(true_num), rank})); auto* out_ptr = dev_ctx.template Alloc(out); if (true_num == 0) { diff --git a/paddle/phi/kernels/cpu/overlap_add_grad_kernel.cc b/paddle/phi/kernels/cpu/overlap_add_grad_kernel.cc index 93fe7a6cb6f095..fe0ab8c309fc4a 100644 --- a/paddle/phi/kernels/cpu/overlap_add_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/overlap_add_grad_kernel.cc @@ -53,17 +53,17 @@ void OverlapAddGradKernel(const Context& dev_ctx, phi::DDim x_grad_resized_dims; phi::DDim out_grad_resized_dims; if (axis == 0) { - preserved_dims = - phi::slice_ddim(out_grad_.dims(), 1, static_cast(out_grad_rank)); + preserved_dims = common::slice_ddim( + out_grad_.dims(), 1, static_cast(out_grad_rank)); x_grad_resized_dims = { - n_frames, frame_length, phi::product(preserved_dims)}; - out_grad_resized_dims = {seq_length, phi::product(preserved_dims)}; + n_frames, frame_length, 
common::product(preserved_dims)}; + out_grad_resized_dims = {seq_length, common::product(preserved_dims)}; } else { - preserved_dims = phi::slice_ddim( + preserved_dims = common::slice_ddim( out_grad_.dims(), 0, static_cast(out_grad_rank) - 1); x_grad_resized_dims = { - phi::product(preserved_dims), frame_length, n_frames}; - out_grad_resized_dims = {phi::product(preserved_dims), seq_length}; + common::product(preserved_dims), frame_length, n_frames}; + out_grad_resized_dims = {common::product(preserved_dims), seq_length}; } x_grad->Resize(x_grad_resized_dims); out_grad_.Resize(out_grad_resized_dims); @@ -78,31 +78,31 @@ void OverlapAddGradKernel(const Context& dev_ctx, trans_out_grad = out_grad_; std::vector perm_x_grad{1, 0}; - auto x_grad_dims_vec = phi::vectorize(x_grad->dims()); + auto x_grad_dims_vec = common::vectorize(x_grad->dims()); for (int i = 0; i < x_grad->dims().size(); ++i) { x_grad_dims_vec[i] = x_grad->dims()[perm_x_grad[i]]; } - trans_x_grad.Resize(phi::make_ddim(x_grad_dims_vec)); + trans_x_grad.Resize(common::make_ddim(x_grad_dims_vec)); dev_ctx.template Alloc(&trans_x_grad); phi::funcs::TransCompute( perm_x_grad.size(), dev_ctx, *x_grad, &trans_x_grad, perm_x_grad); } else { std::vector perm_d_out{1, 0}; - auto out_grad_dims_vec = phi::vectorize(out_grad_.dims()); + auto out_grad_dims_vec = common::vectorize(out_grad_.dims()); for (int i = 0; i < out_grad_.dims().size(); ++i) { out_grad_dims_vec[i] = out_grad_.dims()[perm_d_out[i]]; } - trans_out_grad.Resize(phi::make_ddim(out_grad_dims_vec)); + trans_out_grad.Resize(common::make_ddim(out_grad_dims_vec)); dev_ctx.template Alloc(&trans_out_grad); phi::funcs::TransCompute( perm_d_out.size(), dev_ctx, out_grad_, &trans_out_grad, perm_d_out); std::vector perm_x_grad{2, 1, 0}; - auto x_grad_dims_vec = phi::vectorize(x_grad->dims()); + auto x_grad_dims_vec = common::vectorize(x_grad->dims()); for (int i = 0; i < x_grad->dims().size(); ++i) { x_grad_dims_vec[i] = x_grad->dims()[perm_x_grad[i]]; } - trans_x_grad.Resize(phi::make_ddim(x_grad_dims_vec)); + trans_x_grad.Resize(common::make_ddim(x_grad_dims_vec)); dev_ctx.template Alloc(&trans_x_grad); phi::funcs::TransCompute( perm_x_grad.size(), dev_ctx, *x_grad, &trans_x_grad, perm_x_grad); @@ -151,7 +151,7 @@ void OverlapAddGradKernel(const Context& dev_ctx, restored_x_grad_shape.push_back(n_frames); } - x_grad->Resize(phi::make_ddim(restored_x_grad_shape)); + x_grad->Resize(common::make_ddim(restored_x_grad_shape)); } } diff --git a/paddle/phi/kernels/cpu/overlap_add_kernel.cc b/paddle/phi/kernels/cpu/overlap_add_kernel.cc index ec7fc6d656d03c..ac00eec3e9f138 100644 --- a/paddle/phi/kernels/cpu/overlap_add_kernel.cc +++ b/paddle/phi/kernels/cpu/overlap_add_kernel.cc @@ -50,14 +50,16 @@ void OverlapAddKernel(const Context& dev_ctx, phi::DDim out_resized_dims; if (axis == 0) { preserved_dims = - phi::slice_ddim(out->dims(), 1, static_cast(out_rank)); - x_resized_dims = {n_frames, frame_length, phi::product(preserved_dims)}; - out_resized_dims = {seq_length, phi::product(preserved_dims)}; + common::slice_ddim(out->dims(), 1, static_cast(out_rank)); + x_resized_dims = { + n_frames, frame_length, common::product(preserved_dims)}; + out_resized_dims = {seq_length, common::product(preserved_dims)}; } else { preserved_dims = - phi::slice_ddim(out->dims(), 0, static_cast(out_rank) - 1); - x_resized_dims = {phi::product(preserved_dims), frame_length, n_frames}; - out_resized_dims = {phi::product(preserved_dims), seq_length}; + common::slice_ddim(out->dims(), 0, static_cast(out_rank) - 
1); + x_resized_dims = { + common::product(preserved_dims), frame_length, n_frames}; + out_resized_dims = {common::product(preserved_dims), seq_length}; } x_.Resize(x_resized_dims); out->Resize(out_resized_dims); @@ -72,31 +74,31 @@ void OverlapAddKernel(const Context& dev_ctx, trans_out = *out; std::vector perm_x{1, 0}; - auto x_dims_vec = phi::vectorize(x_.dims()); + auto x_dims_vec = common::vectorize(x_.dims()); for (int i = 0; i < x_.dims().size(); ++i) { x_dims_vec[i] = x_.dims()[perm_x[i]]; } - trans_x.Resize(phi::make_ddim(x_dims_vec)); + trans_x.Resize(common::make_ddim(x_dims_vec)); dev_ctx.template Alloc(&trans_x); phi::funcs::TransCompute( perm_x.size(), dev_ctx, x_, &trans_x, perm_x); } else { std::vector perm_out{1, 0}; - auto out_dims_vec = phi::vectorize(out->dims()); + auto out_dims_vec = common::vectorize(out->dims()); for (int i = 0; i < out->dims().size(); ++i) { out_dims_vec[i] = out->dims()[perm_out[i]]; } - trans_out.Resize(phi::make_ddim(out_dims_vec)); + trans_out.Resize(common::make_ddim(out_dims_vec)); dev_ctx.template Alloc(&trans_out); phi::funcs::TransCompute( perm_out.size(), dev_ctx, *out, &trans_out, perm_out); std::vector perm_x{2, 1, 0}; - auto x_dims_vec = phi::vectorize(x_.dims()); + auto x_dims_vec = common::vectorize(x_.dims()); for (int i = 0; i < x_.dims().size(); ++i) { x_dims_vec[i] = x_.dims()[perm_x[i]]; } - trans_x.Resize(phi::make_ddim(x_dims_vec)); + trans_x.Resize(common::make_ddim(x_dims_vec)); dev_ctx.template Alloc(&trans_x); phi::funcs::TransCompute( perm_x.size(), dev_ctx, x_, &trans_x, perm_x); @@ -137,7 +139,7 @@ void OverlapAddKernel(const Context& dev_ctx, restored_out_shape.push_back(seq_length); } - out->Resize(phi::make_ddim(restored_out_shape)); + out->Resize(common::make_ddim(restored_out_shape)); } } diff --git a/paddle/phi/kernels/cpu/prior_box_kernel.cc b/paddle/phi/kernels/cpu/prior_box_kernel.cc index c289d11069992b..c2c98661686485 100644 --- a/paddle/phi/kernels/cpu/prior_box_kernel.cc +++ b/paddle/phi/kernels/cpu/prior_box_kernel.cc @@ -138,7 +138,7 @@ void PriorBoxKernel(const Context& ctx, } DenseTensor var_t; - var_t.Resize(phi::make_ddim({1, static_cast(variances.size())})); + var_t.Resize(common::make_ddim({1, static_cast(variances.size())})); ctx.template Alloc(&var_t); auto var_et = EigenTensor::From(var_t); diff --git a/paddle/phi/kernels/cpu/prune_gate_by_capacity_kernel.cc b/paddle/phi/kernels/cpu/prune_gate_by_capacity_kernel.cc index ed26b4f37dd5cc..7f2717b8ecacef 100644 --- a/paddle/phi/kernels/cpu/prune_gate_by_capacity_kernel.cc +++ b/paddle/phi/kernels/cpu/prune_gate_by_capacity_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. 
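The overlap_add hunks above repeat a second idiom worth calling out: vectorize() a DDim into a std::vector, permute the entries, then make_ddim() the result back before calling TransCompute. A minimal sketch of that round trip, assuming the relocated common:: helpers; PermutedShape is an illustrative name, not a function from this patch.

#include <cstddef>
#include <vector>
#include "paddle/common/ddim.h"

// Build the DDim a tensor would have after permuting its axes,
// e.g. perm = {1, 0} for the 2-D transposes used by overlap_add.
common::DDim PermutedShape(const common::DDim& dims,
                           const std::vector<int>& perm) {
  std::vector<int64_t> shape = common::vectorize(dims);  // DDim -> vector
  for (size_t i = 0; i < shape.size(); ++i) {
    shape[i] = dims[perm[i]];
  }
  return common::make_ddim(shape);  // vector -> DDim
}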
#include "paddle/phi/kernels/prune_gate_by_capacity_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/randint_kernel.cc b/paddle/phi/kernels/cpu/randint_kernel.cc index 781d5199457f01..c3f1ffe4248ec7 100644 --- a/paddle/phi/kernels/cpu/randint_kernel.cc +++ b/paddle/phi/kernels/cpu/randint_kernel.cc @@ -29,7 +29,7 @@ void RandintKernel(const Context& dev_ctx, DataType dtype UNUSED, DenseTensor* out) { int seed = 0; - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); T* data = dev_ctx.template Alloc(out); auto numel = out->numel(); std::shared_ptr engine; diff --git a/paddle/phi/kernels/cpu/random_routing_kernel.cc b/paddle/phi/kernels/cpu/random_routing_kernel.cc index 0e1d450c1894ae..cdeab98f4c1ab3 100644 --- a/paddle/phi/kernels/cpu/random_routing_kernel.cc +++ b/paddle/phi/kernels/cpu/random_routing_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/random_routing_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc index 05f19ac36107ec..b7b33d4290daec 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc @@ -90,7 +90,7 @@ void RepeatInterleaveGradKernel(const Context& ctx, for (int i = 0; i < x_grad->dims()[dim]; i++) { std::fill_n(index_vec.begin() + i * repeats, repeats, i); } - index.Resize(phi::make_ddim({index_size})); + index.Resize(common::make_ddim({index_size})); phi::TensorFromVector(index_vec, ctx, &index); const DenseTensor index_copy = index; IndexSelectGradInner(ctx, out_grad, index_copy, x_grad, dim); diff --git a/paddle/phi/kernels/cpu/rnn_functor.h b/paddle/phi/kernels/cpu/rnn_functor.h index 4adb754174dacb..d0f5e5787bd170 100644 --- a/paddle/phi/kernels/cpu/rnn_functor.h +++ b/paddle/phi/kernels/cpu/rnn_functor.h @@ -140,7 +140,7 @@ void DropoutCpuFunctionInplace(const CPUContext& dev_ctx, if (is_test) { return; } - size_t size = phi::product(x->dims()); + size_t size = common::product(x->dims()); auto* mask_data = mask->data(); if (!(*is_has_reset)) { // Special case when dropout_prob is 1.0 diff --git a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc index 48d6ea98c16ded..3e0e4c7a3d7a5a 100644 --- a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc @@ -82,7 +82,7 @@ struct GradCell { if (has_sequence_length) { auto& place = *dev_ctx.eigen_device(); auto mask = EigenMatrix::From( - mask_tensor, phi::make_ddim({mask_tensor.dims()[1], 1})); + mask_tensor, common::make_ddim({mask_tensor.dims()[1], 1})); auto mask_broadcast = mask.broadcast(Eigen::DSizes( 1, static_cast(grad_pre_hidden->dims()[2]))); auto pre_hidden_grad = EigenMatrix::Reshape( @@ -394,7 +394,7 @@ struct GradLayer { std::vector mask_tensor_list; int mask_min_length = time_step; if (has_sequence_length) { - mask_matrix.Resize(phi::make_ddim({time_step, input->dims()[1]})); + mask_matrix.Resize(common::make_ddim({time_step, input->dims()[1]})); CreateMaskMatrix( dev_ctx, sequence_length, &mask_matrix, is_reverse, &mask_min_length); mask_tensor_list = Unbind(mask_matrix); @@ -598,7 +598,7 @@ struct GradLayer { const std::string& 
mode) { auto& place = *dev_ctx.eigen_device(); auto mask = EigenMatrix::From( - mask_tensor, phi::make_ddim({mask_tensor.dims()[1], 1})); + mask_tensor, common::make_ddim({mask_tensor.dims()[1], 1})); auto mask_broadcast = mask.broadcast( Eigen::DSizes(1, static_cast(grad_output->dims()[2]))); @@ -1121,8 +1121,8 @@ void RnnGradFunc(const CPUContext& dev_ctx, } // squeeze the hidden first dim for (auto& hidden_tensor : hidden_tensor_unbind) { - hidden_tensor.Resize( - phi::slice_ddim(hidden_tensor.dims(), 1, hidden_tensor.dims().size())); + hidden_tensor.Resize(common::slice_ddim( + hidden_tensor.dims(), 1, hidden_tensor.dims().size())); } // add the output tensor to the hidden vector DenseTensor tmp; diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index 606a8f3ff400d9..a0035c6db4a75d 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -220,7 +220,7 @@ struct Layer { // create the temp input for the X * W_ih^T + Bias_ih const int& hidden_size = weight.dims()[0]; // NOLINT cache_input->Resize( - phi::make_ddim({input.dims()[0], input.dims()[1], hidden_size})); + common::make_ddim({input.dims()[0], input.dims()[1], hidden_size})); if (is_test) { dev_ctx.Alloc(cache_input); } @@ -240,9 +240,9 @@ struct Layer { auto in = EigenMatrix::Reshape(*cache_input, cache_input->dims().size() - 1); - auto bias_ih_tmp = - EigenMatrix::From(bias_ih, phi::make_ddim({1, bias_ih.dims()[0]})); - const int row_num = static_cast(phi::product(cache_input->dims()) / + auto bias_ih_tmp = EigenMatrix::From( + bias_ih, common::make_ddim({1, bias_ih.dims()[0]})); + const int row_num = static_cast(common::product(cache_input->dims()) / cache_input->dims()[2]); in = in + bias_ih_tmp.broadcast(Eigen::DSizes(row_num, 1)); if (is_gru(mode)) { @@ -255,11 +255,11 @@ struct Layer { zero(dev_ctx, &bias_hh_tmp_unbind[2], static_cast(0.0)); auto bias_hh_after_mask = EigenMatrix::From( - bias_hh_tmp, phi::make_ddim({1, bias_hh.dims()[0]})); + bias_hh_tmp, common::make_ddim({1, bias_hh.dims()[0]})); in = in + bias_hh_after_mask.broadcast(Eigen::DSizes(row_num, 1)); } else { - auto bias_hh_no_mask = - EigenMatrix::From(bias_hh, phi::make_ddim({1, bias_hh.dims()[0]})); + auto bias_hh_no_mask = EigenMatrix::From( + bias_hh, common::make_ddim({1, bias_hh.dims()[0]})); in = in + bias_hh_no_mask.broadcast(Eigen::DSizes(row_num, 1)); } } @@ -276,7 +276,7 @@ struct Layer { auto& place = *dev_ctx.eigen_device(); auto out = EigenMatrix::Reshape(*output, output->dims().size() - 1); auto mask = EigenMatrix::From( - mask_tensor, phi::make_ddim({mask_tensor.dims()[1], 1})); + mask_tensor, common::make_ddim({mask_tensor.dims()[1], 1})); auto pre_h = EigenMatrix::Reshape(*init_h, init_h->dims().size() - 1); auto curr_h = EigenMatrix::Reshape(*last_h, last_h->dims().size() - 1); auto mask_broadcast = mask.broadcast( @@ -356,7 +356,7 @@ struct Layer { DenseTensor mask_matrix; int mask_min_length = time_step; if (has_sequence_length) { - mask_matrix.Resize(phi::make_ddim({time_step, input->dims()[1]})); + mask_matrix.Resize(common::make_ddim({time_step, input->dims()[1]})); CreateMaskMatrix( dev_ctx, sequence_length, &mask_matrix, is_reverse, &mask_min_length); @@ -556,7 +556,7 @@ struct Layer { DenseTensor mask_matrix; int mask_min_length = time_step; if (has_sequence_length) { - mask_matrix.Resize(phi::make_ddim({time_step, input->dims()[1]})); + mask_matrix.Resize(common::make_ddim({time_step, input->dims()[1]})); CreateMaskMatrix( dev_ctx, sequence_length,
&mask_matrix, is_reverse, &mask_min_length); mask_tensor_list = Unbind(mask_matrix); diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc index 119f4ea1b0ac40..f6599b2ed47333 100644 --- a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc @@ -81,7 +81,7 @@ void RoiAlignGradKernel(const Context& dev_ctx, int sampling_ratio, bool aligned, DenseTensor* dx) { - const auto& in_dims = phi::vectorize(x.dims()); + const auto& in_dims = common::vectorize(x.dims()); int channels = in_dims[1]; int height = in_dims[2]; int width = in_dims[3]; @@ -129,9 +129,9 @@ void RoiAlignGradKernel(const Context& dev_ctx, const T* out_grad_data = out_grad.data(); T* dx_data = dev_ctx.template Alloc(dx); - auto in_stride = phi::stride(x.dims()); - auto roi_stride = phi::stride(boxes.dims()); - auto out_stride = phi::stride(out_grad.dims()); + auto in_stride = common::stride(x.dims()); + auto roi_stride = common::stride(boxes.dims()); + auto out_stride = common::stride(out_grad.dims()); T roi_offset = aligned ? T(0.5) : 0; for (int n = 0; n < rois_num; ++n) { diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc index bf303b17fbc8bd..7a0a00f82e7cd4 100644 --- a/paddle/phi/kernels/cpu/roi_align_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -198,9 +198,9 @@ void RoiAlignKernel(const Context& dev_ctx, return; } - auto in_stride = phi::stride(in_dims); - auto roi_stride = phi::stride(boxes.dims()); - auto out_stride = phi::stride(out->dims()); + auto in_stride = common::stride(in_dims); + auto roi_stride = common::stride(boxes.dims()); + auto out_stride = common::stride(out->dims()); const T* input_data = x.data(); DenseTensor roi_batch_id_list = Empty(dev_ctx, {rois_num}); diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc index e25a581cbd9dd9..ff1f8578a78b06 100644 --- a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc @@ -66,10 +66,10 @@ void RoiPoolGradKernel(const Context& dev_ctx, phi::funcs::SetConstant set_zero; set_zero(dev_ctx, dx, static_cast(0)); - auto in_stride = phi::stride(x.dims()); - auto arg_max_stride = phi::stride(arg_max.dims()); - auto roi_stride = phi::stride(boxes.dims()); - auto out_stride = phi::stride(out_grad.dims()); + auto in_stride = common::stride(x.dims()); + auto arg_max_stride = common::stride(arg_max.dims()); + auto roi_stride = common::stride(boxes.dims()); + auto out_stride = common::stride(out_grad.dims()); int channels = static_cast(x.dims()[1]); diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc index 9208308ed12e2a..0789fbb4c227fa 100644 --- a/paddle/phi/kernels/cpu/roi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_kernel.cc @@ -42,10 +42,10 @@ void RoiPoolKernel(const Context& dev_ctx, return; } - auto in_stride = phi::stride(x_dims); - auto arg_max_stride = phi::stride(arg_max->dims()); - auto box_stride = phi::stride(boxes.dims()); - auto out_stride = phi::stride(out->dims()); + auto in_stride = common::stride(x_dims); + auto arg_max_stride = common::stride(arg_max->dims()); + auto box_stride = common::stride(boxes.dims()); + auto out_stride = common::stride(out->dims()); const T* input_data = x.data(); diff --git a/paddle/phi/kernels/cpu/send_u_recv_kernel.cc b/paddle/phi/kernels/cpu/send_u_recv_kernel.cc index 3db7844875f244..9e186aeedfab36 
100644 --- a/paddle/phi/kernels/cpu/send_u_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/send_u_recv_kernel.cc @@ -97,11 +97,11 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, } } else { // Set out dim following out_size. - std::vector dims_ = phi::vectorize(src_dims); + std::vector dims_ = common::vectorize(src_dims); if (!dims_.empty()) { dims_[0] = out_size; } - out->Resize(phi::make_ddim(dims_)); + out->Resize(common::make_ddim(dims_)); memset_size = out_size; for (int i = 1; i < src_dims.size(); ++i) { memset_size *= src_dims[i]; diff --git a/paddle/phi/kernels/cpu/send_ue_recv_kernel.cc b/paddle/phi/kernels/cpu/send_ue_recv_kernel.cc index 0dd727811b3ed8..a53efc2bc17b05 100644 --- a/paddle/phi/kernels/cpu/send_ue_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/send_ue_recv_kernel.cc @@ -118,13 +118,13 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, const int& index_size = src_index.dims()[0]; // NOLINT auto out_dims = out->dims(); int64_t memset_size = 1; - std::vector dims_ = phi::vectorize(out_dims); + std::vector dims_ = common::vectorize(out_dims); if (out_size <= 0) { dims_[0] = x.dims()[0]; } else { dims_[0] = out_size; } - out->Resize(phi::make_ddim(dims_)); + out->Resize(common::make_ddim(dims_)); for (auto dim : dims_) { memset_size *= dim; } diff --git a/paddle/phi/kernels/cpu/send_uv_grad_kernel.cc b/paddle/phi/kernels/cpu/send_uv_grad_kernel.cc index c04bdaec0177eb..fb77091d0dbcf4 100644 --- a/paddle/phi/kernels/cpu/send_uv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/send_uv_grad_kernel.cc @@ -60,7 +60,7 @@ void CalculateGrad(const Context& ctx, } } else { const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, x_grad_dims); - auto out_grad_dims_1 = phi::vectorize(out_grad_dims); + auto out_grad_dims_1 = common::vectorize(out_grad_dims); std::vector out_grad_dims_2(out_grad_dims_1.begin() + 1, out_grad_dims_1.end()); out_grad_dims_2.emplace(out_grad_dims_2.begin(), x_grad_dims[0]); @@ -117,7 +117,7 @@ void CalculateGrad(const Context& ctx, } } } else { - auto out_grad_dims_1 = phi::vectorize(out_grad_dims); + auto out_grad_dims_1 = common::vectorize(out_grad_dims); std::vector out_grad_dims_2(out_grad_dims_1.begin() + 1, out_grad_dims_1.end()); out_grad_dims_2.emplace(out_grad_dims_2.begin(), x_grad_dims[0]); diff --git a/paddle/phi/kernels/cpu/shuffle_batch_kernel.cc b/paddle/phi/kernels/cpu/shuffle_batch_kernel.cc index f7717d5aee4af1..78ab7492084e0c 100644 --- a/paddle/phi/kernels/cpu/shuffle_batch_kernel.cc +++ b/paddle/phi/kernels/cpu/shuffle_batch_kernel.cc @@ -83,7 +83,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, // std::shuffle(idx_vec.begin(), idx_vec.end(), engine); // ShuffleIdx record shuffle order - shuffleidx->Resize(phi::make_ddim({(int64_t)idx_vec.size()})); + shuffleidx->Resize(common::make_ddim({(int64_t)idx_vec.size()})); auto* shuffleidx_data = dev_ctx.template HostAlloc(shuffleidx); for (size_t i = 0; i < idx_vec.size(); i++) { @@ -99,7 +99,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, x_embed_size * sizeof(T)); } // set new seed - seed_out->Resize(phi::make_ddim({1})); + seed_out->Resize(common::make_ddim({1})); auto* seed_out_data = dev_ctx.template HostAlloc(seed_out); *seed_out_data = engine(); } diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc index d296aba66503b7..f3c724489714af 100644 --- a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc +++ 
b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc @@ -143,7 +143,7 @@ struct SparseWeightEmbeddingSparseGradCPUFunctor { auto d_output_dims = d_output->dims(); auto d_output_dims_2d = - phi::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); + common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, phi::errors::InvalidArgument( diff --git a/paddle/phi/kernels/cpu/strided_copy_kernel.cc b/paddle/phi/kernels/cpu/strided_copy_kernel.cc index a25893eb6b571a..19ce258a4313cb 100644 --- a/paddle/phi/kernels/cpu/strided_copy_kernel.cc +++ b/paddle/phi/kernels/cpu/strided_copy_kernel.cc @@ -29,8 +29,8 @@ void StridedCopyKernel(const Context& dev_ctx, int64_t offset, DenseTensor* out) { phi::DenseTensorMeta meta = input.meta(); - meta.strides = phi::make_ddim(out_stride); - meta.dims = phi::make_ddim(dims); + meta.strides = common::make_ddim(out_stride); + meta.dims = common::make_ddim(dims); meta.offset = offset; out->set_meta(meta); diff --git a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc index 42d0acd901c71a..1707517f57455c 100644 --- a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/temporal_shift_grad_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -92,7 +92,7 @@ void TemporalShiftGradKernel(const Context& dev_ctx, auto* input_grad = x_grad; auto* output_grad = &out_grad; int t = seg_num; - const DataLayout data_layout = phi::StringToDataLayout(data_format_str); + const DataLayout data_layout = common::StringToDataLayout(data_format_str); const int nt = static_cast(output_grad->dims()[0]); const int c = static_cast(data_layout == DataLayout::kNCHW @@ -114,8 +114,8 @@ void TemporalShiftGradKernel(const Context& dev_ctx, const int c2 = static_cast(static_cast(c) * 2.f * shift_ratio); DDim in_grad_dims = - (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w}) - : phi::make_ddim({nt, h, w, c})); + (data_layout == DataLayout::kNCHW ? common::make_ddim({nt, c, h, w}) + : common::make_ddim({nt, h, w, c})); const T* output_grad_data = output_grad->data(); input_grad->Resize(in_grad_dims); diff --git a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc index 4e183bf81b4997..8e7084264382ce 100644 --- a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc +++ b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/temporal_shift_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -92,7 +92,7 @@ void TemporalShiftKernel(const Context& dev_ctx, auto* input = &x; auto* output = out; int t = seg_num; - const DataLayout data_layout = phi::StringToDataLayout(data_format_str); + const DataLayout data_layout = common::StringToDataLayout(data_format_str); const int nt = static_cast(input->dims()[0]); const int c = static_cast( @@ -111,8 +111,8 @@ void TemporalShiftKernel(const Context& dev_ctx, const int c2 = static_cast(static_cast(c) * 2.f * shift_ratio); DDim out_dims = - (data_layout == DataLayout::kNCHW ? 
phi::make_ddim({nt, c, h, w}) - : phi::make_ddim({nt, h, w, c})); + (data_layout == DataLayout::kNCHW ? common::make_ddim({nt, c, h, w}) + : common::make_ddim({nt, h, w, c})); const T* input_data = input->data(); output->Resize(out_dims); T* output_data = dev_ctx.template Alloc(output); diff --git a/paddle/phi/kernels/cpu/top_k_grad_kernel.cc b/paddle/phi/kernels/cpu/top_k_grad_kernel.cc index d98ca1702e3b56..1e7f6b890df290 100644 --- a/paddle/phi/kernels/cpu/top_k_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/top_k_grad_kernel.cc @@ -76,7 +76,7 @@ void TopkGradKernel(const Context& dev_ctx, // assign the out_grad to input_grad directly const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; // init the output grad with 0, because some input elements have no grad @@ -120,8 +120,8 @@ void TopkGradKernel(const Context& dev_ctx, ndims, dev_ctx, out_grad, &trans_dO, trans); funcs::TransCompute( ndims, dev_ctx, indices, &trans_ind, trans); - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; // Assign the out_grad to transposed input_grad diff --git a/paddle/phi/kernels/cpu/top_k_kernel.cc b/paddle/phi/kernels/cpu/top_k_kernel.cc index 858be275734d42..d769613bef92b0 100644 --- a/paddle/phi/kernels/cpu/top_k_kernel.cc +++ b/paddle/phi/kernels/cpu/top_k_kernel.cc @@ -172,7 +172,7 @@ void TopkKernel(const Context& dev_ctx, const auto& out_dims = out->dims(); if (axis + 1 == in_dims.size()) { const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t& input_width = in_dims[in_dims.size() - 1]; FullTopK(input_height, input_width, @@ -214,8 +214,8 @@ void TopkKernel(const Context& dev_ctx, funcs::TransCompute( ndims, dev_ctx, *input, &trans_inp, trans); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const int64_t input_width = trans_dims[trans_dims.size() - 1]; // Allocate the temp tensor to save the topk indices, values diff --git a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc index 06c897b2199845..6245eb90426405 100644 --- a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc +++ b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/triangular_solve_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" @@ -46,7 +46,7 @@ void TriangularSolveKernel(const Context& dev_ctx, const T* x_bst_data = x_bst.data(); ExpandKernel(dev_ctx, x, x_bst_dims, &x_bst); - out->Resize(phi::make_ddim(y_bst_dims_vec)); + out->Resize(common::make_ddim(y_bst_dims_vec)); T* out_data = dev_ctx.template Alloc(out); IntArray y_bst_dims(y_bst_dims_vec); ExpandKernel(dev_ctx, y, y_bst_dims, out); diff --git a/paddle/phi/kernels/cpu/uniform_kernel.cc
b/paddle/phi/kernels/cpu/uniform_kernel.cc index d850dc5074e033..5a85675bdeffa0 100644 --- a/paddle/phi/kernels/cpu/uniform_kernel.cc +++ b/paddle/phi/kernels/cpu/uniform_kernel.cc @@ -27,7 +27,7 @@ void UniformKernel(const Context &dev_ctx, const Scalar &max, int seed, DenseTensor *out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); T *data = dev_ctx.template Alloc(out); auto size = out->numel(); std::shared_ptr engine; diff --git a/paddle/phi/kernels/cpu/unique_consecutive_functor.h b/paddle/phi/kernels/cpu/unique_consecutive_functor.h index 2daee69eed92dc..cf7086f80f5a6a 100644 --- a/paddle/phi/kernels/cpu/unique_consecutive_functor.h +++ b/paddle/phi/kernels/cpu/unique_consecutive_functor.h @@ -60,18 +60,18 @@ static void UniqueConsecutiveFlattenedTensor(const Context& context, } out_vec.resize(output_size); - out->Resize(phi::make_ddim({output_size})); + out->Resize(common::make_ddim({output_size})); auto* out_data = context.template Alloc(out); std::copy(out_vec.begin(), out_vec.end(), out_data); if (return_inverse) { - inverse->Resize(phi::make_ddim({in.numel()})); + inverse->Resize(common::make_ddim({in.numel()})); auto* inverse_data = context.template Alloc(inverse); std::copy(inverse_vec.begin(), inverse_vec.end(), inverse_data); } if (return_counts) { - count->Resize(phi::make_ddim({out->numel()})); + count->Resize(common::make_ddim({out->numel()})); auto* counts_data = context.template Alloc(count); std::copy(counts_vec.begin(), counts_vec.end(), counts_data); } @@ -156,17 +156,17 @@ static void UniqueConsecutiveDim(const Context& context, std::iota(permute.begin(), permute.end(), 0); permute[axis] = 0; permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + std::vector in_trans_dims_vec(common::vectorize(in.dims())); in_trans_dims_vec[axis] = in.dims()[0]; in_trans_dims_vec[0] = in.dims()[axis]; DenseTensor in_trans; - DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + DDim in_trans_dims = common::make_ddim(in_trans_dims_vec); in_trans.Resize(in_trans_dims); context.template Alloc(&in_trans); phi::funcs::TransCompute( in.dims().size(), context, in, &in_trans, permute); // reshape tensor: eg. 
[dim1, dim0, dim2] -> [dim1, dim0*dim2] - DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + DDim in_trans_flat_dims = common::flatten_to_2d(in_trans_dims, 1); in_trans.Resize(in_trans_flat_dims); std::vector sorted_indices_vec(in_trans.dims()[0]); @@ -202,10 +202,10 @@ static void UniqueConsecutiveDim(const Context& context, DenseTensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = input_unbind.size(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + out_trans.Resize(common::make_ddim(out_trans_dims_vec)); context.template Alloc(&out_trans); std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); + out->Resize(common::make_ddim(out_trans_dims_vec)); context.template Alloc(out); concat_functor(context, input_unbind, 0, &out_trans); phi::funcs::TransCompute( diff --git a/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc index 8c3a14a5edf76e..dc0b96ec839973 100644 --- a/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc +++ b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc @@ -17,8 +17,8 @@ #include "paddle/phi/kernels/cpu/unique_consecutive_functor.h" #include "paddle/phi/kernels/unique_consecutive_kernel.h" +#include "paddle/common/errors.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index e6ef962c665c28..088a4fe4ffd266 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -69,7 +69,7 @@ void DistGradKernel(const Context& dev_ctx, // the dims of output internally, so we Resize x/y_grad twice. auto res_x = GetReduceDims(x_grad_tmp.dims(), x.dims()); if (!std::get<0>(res_x).empty()) { - x_grad->Resize(phi::make_ddim(std::get<1>(res_x))); + x_grad->Resize(common::make_ddim(std::get<1>(res_x))); SumKernel( dev_ctx, x_grad_tmp, std::get<0>(res_x), x.dtype(), false, x_grad); x_grad->Resize(x.dims()); @@ -82,7 +82,7 @@ void DistGradKernel(const Context& dev_ctx, ScaleKernel(dev_ctx, x_grad_tmp, -1.0, 0.0, false, &y_grad_tmp); auto res_y = GetReduceDims(y_grad_tmp.dims(), y.dims()); if (!std::get<0>(res_y).empty()) { - y_grad->Resize(phi::make_ddim(std::get<1>(res_y))); + y_grad->Resize(common::make_ddim(std::get<1>(res_y))); SumKernel( dev_ctx, y_grad_tmp, std::get<0>(res_y), y.dtype(), false, y_grad); y_grad->Resize(y.dims()); diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 8df5e9a543eb25..d2391a5702d4b1 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -12,10 +12,10 @@ See the License for the specific language governing permissions and limitations under the License.
*/ #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/common/macros.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/macros.h" namespace phi { @@ -24,7 +24,7 @@ void EmptyKernel(const Context& dev_ctx, const IntArray& shape, DataType dtype UNUSED, DenseTensor* out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); dev_ctx.template Alloc(out); } diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 42d137ba4f4190..ebe1b1d24e50a5 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -27,7 +27,7 @@ void FlattenGradKernel(const Context& dev_ctx, DenseTensor* x_grad) { auto xshape_dims = xshape.dims(); dev_ctx.Alloc(x_grad, out_grad.dtype()); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); x_grad->Resize(x_dims); } diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc index 31bcbc2eeea949..cd603dd57e64d1 100644 --- a/paddle/phi/kernels/full_kernel.cc +++ b/paddle/phi/kernels/full_kernel.cc @@ -31,7 +31,7 @@ void FullBatchSizeLikeKernel(const Context& dev_ctx, // set the correct batch size for the LoDTensor. auto odims = out->dims(); odims[out_batch_size_dim] = static_cast(x.lod().back().size()) - 1; - FullKernel(dev_ctx, phi::vectorize(odims), val, dtype, out); + FullKernel(dev_ctx, common::vectorize(odims), val, dtype, out); } FullLikeKernel(dev_ctx, x, val, dtype, out); } diff --git a/paddle/phi/kernels/funcs/affine_grid_utils.h b/paddle/phi/kernels/funcs/affine_grid_utils.h index 1e6701d0c7e833..b973d75a9becdc 100644 --- a/paddle/phi/kernels/funcs/affine_grid_utils.h +++ b/paddle/phi/kernels/funcs/affine_grid_utils.h @@ -45,7 +45,7 @@ inline void GetIdxMap4D(int n, DenseTensor* grid, const Context& dev_ctx) { auto& place = *dev_ctx.eigen_device(); - grid->Resize(phi::make_ddim({n, h, w, 3})); + grid->Resize(common::make_ddim({n, h, w, 3})); dev_ctx.template Alloc(grid); auto grid_t = EigenTensor::From(*grid); // Get indexes of height with shape [height, width, 1] @@ -59,7 +59,7 @@ inline void GetIdxMap4D(int n, auto w_idx_t = EigenTensor::From(w_idx); // Get constant ones tensor with shape [height, width, 1] DenseTensor ones; - ones.Resize(phi::make_ddim({h, w, 1})); + ones.Resize(common::make_ddim({h, w, 1})); dev_ctx.template Alloc(&ones); phi::funcs::SetConstant()(dev_ctx, &ones, static_cast(1)); @@ -67,22 +67,22 @@ inline void GetIdxMap4D(int n, // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and // ones DenseTensor w_idx_map; - w_idx_map.Resize(phi::make_ddim({h, w, 1})); + w_idx_map.Resize(common::make_ddim({h, w, 1})); dev_ctx.template Alloc(&w_idx_map); auto w_idx_map_t = EigenTensor::From(w_idx_map); DenseTensor h_idx_map; - h_idx_map.Resize(phi::make_ddim({h, w, 1})); + h_idx_map.Resize(common::make_ddim({h, w, 1})); dev_ctx.template Alloc(&h_idx_map); auto h_idx_map_t = EigenTensor::From(h_idx_map); DenseTensor w_h_idx_map; - w_h_idx_map.Resize(phi::make_ddim({h, w, 2})); + w_h_idx_map.Resize(common::make_ddim({h, w, 2})); dev_ctx.template Alloc(&w_h_idx_map); auto w_h_idx_map_t = EigenTensor::From(w_h_idx_map); DenseTensor w_h_one_idx_map; - w_h_one_idx_map.Resize(phi::make_ddim({h, w, 3})); + 
w_h_one_idx_map.Resize(common::make_ddim({h, w, 3})); dev_ctx.template Alloc(&w_h_one_idx_map); auto w_h_one_idx_map_t = EigenTensor::From(w_h_one_idx_map); @@ -109,7 +109,7 @@ inline void GetIdxMap5D(int n, DenseTensor* grid, const Context& dev_ctx) { auto& place = *dev_ctx.eigen_device(); - grid->Resize(phi::make_ddim({n, d, h, w, 4})); + grid->Resize(common::make_ddim({n, d, h, w, 4})); dev_ctx.template Alloc(grid); auto grid_t = EigenTensor::From(*grid); // Get indexes of height with shape [depth, height, width, 1] @@ -127,7 +127,7 @@ inline void GetIdxMap5D(int n, auto w_idx_t = EigenTensor::From(w_idx); // Get constant ones tensor with shape [depth, height, width, 1] DenseTensor ones; - ones.Resize(phi::make_ddim({d, h, w, 1})); + ones.Resize(common::make_ddim({d, h, w, 1})); dev_ctx.template Alloc(&ones); phi::funcs::SetConstant()(dev_ctx, &ones, static_cast(1)); @@ -135,32 +135,32 @@ inline void GetIdxMap5D(int n, // Get grid tensor with shape [n, d, h, w, 4] by concatenating d_idx, h_idx, // w_idx and ones DenseTensor w_idx_map; - w_idx_map.Resize(phi::make_ddim({d, h, w, 1})); + w_idx_map.Resize(common::make_ddim({d, h, w, 1})); dev_ctx.template Alloc(&w_idx_map); auto w_idx_map_t = EigenTensor::From(w_idx_map); DenseTensor h_idx_map; - h_idx_map.Resize(phi::make_ddim({d, h, w, 1})); + h_idx_map.Resize(common::make_ddim({d, h, w, 1})); dev_ctx.template Alloc(&h_idx_map); auto h_idx_map_t = EigenTensor::From(h_idx_map); DenseTensor d_idx_map; - d_idx_map.Resize(phi::make_ddim({d, h, w, 1})); + d_idx_map.Resize(common::make_ddim({d, h, w, 1})); dev_ctx.template Alloc(&d_idx_map); auto d_idx_map_t = EigenTensor::From(d_idx_map); DenseTensor w_h_idx_map; - w_h_idx_map.Resize(phi::make_ddim({d, h, w, 2})); + w_h_idx_map.Resize(common::make_ddim({d, h, w, 2})); dev_ctx.template Alloc(&w_h_idx_map); auto w_h_idx_map_t = EigenTensor::From(w_h_idx_map); DenseTensor w_h_d_idx_map; - w_h_d_idx_map.Resize(phi::make_ddim({d, h, w, 3})); + w_h_d_idx_map.Resize(common::make_ddim({d, h, w, 3})); dev_ctx.template Alloc(&w_h_d_idx_map); auto w_h_d_idx_map_t = EigenTensor::From(w_h_d_idx_map); DenseTensor w_h_d_one_idx_map; - w_h_d_one_idx_map.Resize(phi::make_ddim({d, h, w, 4})); + w_h_d_one_idx_map.Resize(common::make_ddim({d, h, w, 4})); dev_ctx.template Alloc(&w_h_d_one_idx_map); auto w_h_d_one_idx_map_t = EigenTensor::From(w_h_d_one_idx_map); diff --git a/paddle/phi/kernels/funcs/axis_utils.h b/paddle/phi/kernels/funcs/axis_utils.h index 368c4a9e14061c..41bbd4f048c6b4 100644 --- a/paddle/phi/kernels/funcs/axis_utils.h +++ b/paddle/phi/kernels/funcs/axis_utils.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/batch_norm_utils.h b/paddle/phi/kernels/funcs/batch_norm_utils.h index 64a20ee5d2e098..c7f25fd5f548d7 100644 --- a/paddle/phi/kernels/funcs/batch_norm_utils.h +++ b/paddle/phi/kernels/funcs/batch_norm_utils.h @@ -31,30 +31,30 @@ inline void ResizeToChannelFirst(const DeviceContext& context, // input transformed_input->Resize(input->dims()); - auto in_dims_vec = vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[4]; in_dims_vec[2] = input->dims()[1]; in_dims_vec[3] = input->dims()[2]; in_dims_vec[4] = input->dims()[3]; - transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); context.template Alloc(transformed_input); } else if (dim == 2) { // input transformed_input->Resize(input->dims()); - auto in_dims_vec = vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[3]; in_dims_vec[2] = input->dims()[1]; in_dims_vec[3] = input->dims()[2]; - transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); context.template Alloc(transformed_input); } else if (dim == 1) { transformed_input->Resize(input->dims()); - auto in_dims_vec = vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[2]; in_dims_vec[2] = input->dims()[1]; - transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); context.template Alloc(transformed_input); } } @@ -68,31 +68,31 @@ inline void ResizeToChannelLast(const DeviceContext& context, // input transformed_input->Resize(input->dims()); - auto in_dims_vec = vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[2]; in_dims_vec[2] = input->dims()[3]; in_dims_vec[3] = input->dims()[4]; in_dims_vec[4] = input->dims()[1]; - transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); context.template Alloc(transformed_input); } else if (dim == 2) { // input transformed_input->Resize(input->dims()); - auto in_dims_vec = vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[2]; in_dims_vec[2] = input->dims()[3]; in_dims_vec[3] = input->dims()[1]; - transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); context.template Alloc(transformed_input); } else if (dim == 1) { transformed_input->Resize(input->dims()); - auto in_dims_vec = vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[2]; in_dims_vec[2] = input->dims()[1]; - transformed_input->Resize(make_ddim(in_dims_vec)); + transformed_input->Resize(common::make_ddim(in_dims_vec)); context.template Alloc(transformed_input); } } diff --git a/paddle/phi/kernels/funcs/blas/blas.cc b/paddle/phi/kernels/funcs/blas/blas.cc index a43005cf86939f..9b6ac51317a95e 100644 --- a/paddle/phi/kernels/funcs/blas/blas.cc +++ b/paddle/phi/kernels/funcs/blas/blas.cc @@ -27,7 +27,7 @@ MatDescriptor CreateMatrixDescriptor(const DDim &tensor_dim, tensor_dim.size())); MatDescriptor retv; if (num_flatten_cols > 1) { - auto flatten_dim = phi::flatten_to_2d(tensor_dim, num_flatten_cols); + auto 
flatten_dim = common::flatten_to_2d(tensor_dim, num_flatten_cols); retv.height_ = flatten_dim[0]; retv.width_ = flatten_dim[1]; } else { @@ -35,7 +35,7 @@ MatDescriptor CreateMatrixDescriptor(const DDim &tensor_dim, retv.height_ = tensor_dim[0]; retv.width_ = tensor_dim[1]; } else { - auto dim_vec = phi::vectorize(tensor_dim); + auto dim_vec = common::vectorize(tensor_dim); retv.batch_size_ = 1; for (size_t i = 0; i < dim_vec.size() - 2; ++i) { retv.batch_size_ *= dim_vec[i]; diff --git a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h index 0fca9de54b2ba9..4e1db73c69e64d 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h @@ -1042,8 +1042,8 @@ struct LinearWithCublasLt : public CublasLtBase { const bool trans_x, const bool trans_y, const MatmulFusedType fused_type) { - auto planner = phi::funcs::MatmulPlanner(vectorize(x->dims()), - vectorize(y->dims()), + auto planner = phi::funcs::MatmulPlanner(common::vectorize(x->dims()), + common::vectorize(y->dims()), trans_x, trans_y, phi::CppTypeToDataType::Type(), @@ -1080,8 +1080,8 @@ struct LinearGradWithCublasLt : public CublasLtBase { const bool use_addto, const bool no_exchange, // exchange x_desc and y_desc for grad. bool grad_for_dx = true) { - auto planner = phi::funcs::MatmulPlanner(vectorize(x->dims()), - vectorize(y->dims()), + auto planner = phi::funcs::MatmulPlanner(common::vectorize(x->dims()), + common::vectorize(y->dims()), trans_x, trans_y, phi::CppTypeToDataType::Type(), diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index a1f9c1eb4346cb..822801e10c357c 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -34,12 +34,12 @@ enum BroadcastType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; template struct BroadcastTypeClassifier { int64_t numel{0}; - int broadcast_num{0}; // Not used for XPU - bool all_elementwise{true}; // Not used for XPU - phi::Array use_broadcast; // Not used for XPU - phi::Array configs; - phi::Array ins_data; - phi::Array<_ptr_ OutT *, NumOuts> outs_data; + int broadcast_num{0}; // Not used for XPU + bool all_elementwise{true}; // Not used for XPU + Array use_broadcast; // Not used for XPU + Array configs; + Array ins_data; + Array<_ptr_ OutT *, NumOuts> outs_data; BroadcastTypeClassifier() {} BroadcastTypeClassifier(const std::vector &ins, @@ -289,11 +289,11 @@ template __device__ void VectorizedBroadcastKernelImpl( - const phi::Array &ins, - phi::Array<_ptr_ OutT *, NumOuts> outs, - const phi::Array &use_broadcast, + const Array &ins, + Array<_ptr_ OutT *, NumOuts> outs, + const Array &use_broadcast, const uint32_t numel, - const phi::Array &configs, + const Array &configs, int num, int block_offset, int read_lens, @@ -349,11 +349,11 @@ template __global__ void VectorizedBroadcastKernel( - phi::Array ins, - phi::Array<_ptr_ OutT *, NumOuts> outs, - phi::Array use_broadcast, + Array ins, + Array<_ptr_ OutT *, NumOuts> outs, + Array use_broadcast, uint32_t numel, - phi::Array configs, + Array configs, int main_offset, int tail_tid, int read_lens, @@ -580,7 +580,7 @@ static void SliceTensor(DenseTensor *x, const DenseTensor *share, const std::vector &out_compute_dims, int64_t offset) { - auto new_dim = make_ddim(out_compute_dims); + auto new_dim = common::make_ddim(out_compute_dims); DenseTensorMeta meta(share->dtype(), new_dim, share->layout(), diff --git 
a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index 8249d5bf22efbc..dea6e9f6ab3e0b 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -28,7 +28,7 @@ inline void SetXShape(const DenseTensor &x, DenseTensor *xshape) { for (int i = 0; i < in_dims.size(); ++i) { xshape_dims[i + 1] = in_dims[i]; } - xshape->ResizeAndAllocate(phi::make_ddim(xshape_dims)); + xshape->ResizeAndAllocate(common::make_ddim(xshape_dims)); xshape->ResetLoD(x.meta().lod); } @@ -114,7 +114,7 @@ static DDim ExtendDims2Rank(const DDim &in_dims, int rank) { for (int i = in_dims.size() - 1, j = rank - 1; i >= 0; --i, --j) { shapes[j] = in_dims[i]; } - return make_ddim(shapes); + return common::make_ddim(shapes); } template @@ -181,8 +181,8 @@ static inline std::vector MatrixGetBroadcastBatchPortion( // batch_size of matrix static inline std::tuple, std::vector> MatrixGetBroadcastDims(const DenseTensor &x, const DenseTensor &y) { - std::vector x_dims_vec = phi::vectorize(x.dims()); - std::vector y_dims_vec = phi::vectorize(y.dims()); + std::vector x_dims_vec = common::vectorize(x.dims()); + std::vector y_dims_vec = common::vectorize(y.dims()); std::vector::const_iterator f1 = x_dims_vec.begin(); std::vector::const_iterator l1 = x_dims_vec.end() - 2; @@ -212,7 +212,7 @@ inline DDim GetOutputDims(const DDim &s_dims, const DDim &l_dims) { if (s_dims.size() > l_dims.size()) { return GetOutputDims(l_dims, s_dims); } - std::vector shapes = phi::vectorize(l_dims); + std::vector shapes = common::vectorize(l_dims); for (int i = s_dims.size() - 1, j = l_dims.size() - 1; i >= 0; --i, --j) { int64_t s = s_dims[i]; int64_t l = l_dims[j]; @@ -230,7 +230,7 @@ inline DDim GetOutputDims(const DDim &s_dims, const DDim &l_dims) { } } } - return phi::make_ddim(shapes); + return common::make_ddim(shapes); } inline int64_t CalStride(phi::DDim dim) { @@ -274,7 +274,7 @@ inline void FCOutputSize(const DDim &in_dims, std::vector &out_dims, // NOLINT int in_num_col_dims, bool padding_weights) { - auto in_mat_dims = phi::flatten_to_2d(in_dims, in_num_col_dims); + auto in_mat_dims = common::flatten_to_2d(in_dims, in_num_col_dims); auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0]; auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1]; PADDLE_ENFORCE_EQ( @@ -288,7 +288,7 @@ inline void FCOutputSize(const DDim &in_dims, in_mat_dims[1], in_mat_dims, w_dims0, - phi::make_ddim({w_dims0, w_dims1}))); + common::make_ddim({w_dims0, w_dims1}))); out_dims.reserve(static_cast(in_num_col_dims + 1)); for (int i = 0; i < in_num_col_dims; ++i) { diff --git a/paddle/phi/kernels/funcs/compound_functors.h b/paddle/phi/kernels/funcs/compound_functors.h index 121597bca68731..823dcd70a2f3c8 100644 --- a/paddle/phi/kernels/funcs/compound_functors.h +++ b/paddle/phi/kernels/funcs/compound_functors.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include #include -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/concat_funcs.h b/paddle/phi/kernels/funcs/concat_funcs.h index db965c2ef9b654..6a2dbf953b9b25 100644 --- a/paddle/phi/kernels/funcs/concat_funcs.h +++ b/paddle/phi/kernels/funcs/concat_funcs.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/errors.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/cufft_util.h b/paddle/phi/kernels/funcs/cufft_util.h index 3a4a3ef5e59149..52dfb8733f8a5e 100644 --- a/paddle/phi/kernels/funcs/cufft_util.h +++ b/paddle/phi/kernels/funcs/cufft_util.h @@ -15,8 +15,8 @@ #pragma once #include +#include "paddle/common/ddim.h" #include "paddle/phi/backends/dynload/cufft.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/fft.h" #include "paddle/phi/kernels/funcs/fft_key.h" diff --git a/paddle/phi/kernels/funcs/cumprod.h b/paddle/phi/kernels/funcs/cumprod.h index 4eefd4559c33a2..fad43f4acef72a 100644 --- a/paddle/phi/kernels/funcs/cumprod.h +++ b/paddle/phi/kernels/funcs/cumprod.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/data_layout_transform.cc b/paddle/phi/kernels/funcs/data_layout_transform.cc index b949b7945a0469..a9779d8d78943d 100644 --- a/paddle/phi/kernels/funcs/data_layout_transform.cc +++ b/paddle/phi/kernels/funcs/data_layout_transform.cc @@ -16,10 +16,10 @@ #include "glog/logging.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" @@ -57,7 +57,7 @@ void* GetDataFromTensor(const DenseTensor& tensor, // 0-D now. dnnl::memory::desc make_memory_desc(const phi::DenseTensor& ref_tensor, phi::DataLayout target_layout) { - auto ref_dims = vectorize(ref_tensor.dims()); + auto ref_dims = common::vectorize(ref_tensor.dims()); auto ref_type = ToOneDNNDataType(ref_tensor.dtype()); PADDLE_ENFORCE_NE(ref_type, OneDNNDataType::undef, @@ -84,7 +84,7 @@ void TransDataLayoutFromOneDNN(DataLayout in_layout, auto& pool = DeviceContextPool::Instance(); auto* dev_ctx = dynamic_cast(pool.Get(place)); auto& cpu_engine = dev_ctx->GetEngine(); - auto in_dims = vectorize(in.dims()); + auto in_dims = common::vectorize(in.dims()); auto md_dims = !in_dims.empty() ? in_dims : std::vector{1}; const auto src_mem_desc = @@ -102,7 +102,7 @@ void TransDataLayoutFromOneDNN(DataLayout in_layout, // Note(0x45f): Using initialized() to support slice Tensors // with shapes like [0, 0, 0]. 
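Taken together, the funcs/ hunks follow one mechanical recipe that downstream code can mirror when picking up this change: swap the phi-core includes for their paddle/common equivalents and re-qualify the DDim/layout helpers, leaving kernel logic untouched. A condensed editorial sketch of a migrated helper (NumRows is an illustrative name; that DDim and its helpers live in namespace common after this move is an assumption based on the hunks in this section):

#include <cstdint>
#include "paddle/common/ddim.h"  // was: "paddle/phi/core/ddim.h"

// Only the include path and the namespace qualifier change in this PR.
int64_t NumRows(const common::DDim& dims, int num_col_dims) {
  // was: phi::flatten_to_2d(dims, num_col_dims)[0]
  return common::flatten_to_2d(dims, num_col_dims)[0];
}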
if (in.initialized() && ((in.mem_desc() != out->mem_desc()) || always_copy)) { - auto in_tz = vectorize(in.dims()); + auto in_tz = common::vectorize(in.dims()); auto in_type = ToOneDNNDataType(in.dtype()); void* in_data = GetDataFromTensor(in, in_type); diff --git a/paddle/phi/kernels/funcs/data_layout_transform.h b/paddle/phi/kernels/funcs/data_layout_transform.h index 146e53d1c4be3b..4bcc96d9c2ab70 100644 --- a/paddle/phi/kernels/funcs/data_layout_transform.h +++ b/paddle/phi/kernels/funcs/data_layout_transform.h @@ -18,8 +18,8 @@ #include "dnnl.hpp" // NOLINT #endif +#include "paddle/common/layout.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" @@ -44,7 +44,7 @@ inline OneDNNMemoryFormat ToOneDNNFormat(const DataLayout& layout) { default: PADDLE_THROW( errors::InvalidArgument("Fail to convert layout %s to oneDNN format.", - ::phi::DataLayoutToString(layout))); + ::common::DataLayoutToString(layout))); } } diff --git a/paddle/phi/kernels/funcs/detail/activation_functions.h b/paddle/phi/kernels/funcs/detail/activation_functions.h index f1352df226094b..758503563680be 100644 --- a/paddle/phi/kernels/funcs/detail/activation_functions.h +++ b/paddle/phi/kernels/funcs/detail/activation_functions.h @@ -18,9 +18,9 @@ limitations under the License. */ #include #include +#include "paddle/common/macros.h" #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/core/macros.h" namespace phi { namespace funcs { namespace detail { diff --git a/paddle/phi/kernels/funcs/detail/strided_memcpy.h b/paddle/phi/kernels/funcs/detail/strided_memcpy.h index 0cd07fdfd0e1ae..03e3bdde05ad09 100644 --- a/paddle/phi/kernels/funcs/detail/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/detail/strided_memcpy.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/device_context.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/kernels/funcs/diag_functor.h b/paddle/phi/kernels/funcs/diag_functor.h index 1862f5ec91b4bc..6fe54363e6f0e2 100644 --- a/paddle/phi/kernels/funcs/diag_functor.h +++ b/paddle/phi/kernels/funcs/diag_functor.h @@ -112,7 +112,7 @@ DenseTensor BatchDiag(const Context& dev_ctx, const DenseTensor& x, int batch) { for (int i = 0; i < num_dims - 1; ++i) { out_shape.push_back(x.dims()[i]); } - out.Resize(phi::make_ddim(out_shape)); + out.Resize(common::make_ddim(out_shape)); int order = x.dims()[num_dims - 1]; int stride_out = order * order; int stride_in = order + 1; diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index a30fb79f8c8b04..5504a337e88f2e 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -70,7 +70,7 @@ DenseTensor Diagonal(const DeviceContext& context, int64_t dim2) { auto* input_data = input->data(); auto input_dims = input->dims(); - auto input_stride = phi::stride(input_dims); + auto input_stride = common::stride(input_dims); auto dim1_ = dim1 < 0 ? input_dims.size() + dim1 : dim1; auto dim2_ = dim2 < 0 ? input_dims.size() + dim2 : dim2; auto len1 = input_dims[std::min(dim1_, dim2_)]; @@ -89,8 +89,8 @@ DenseTensor Diagonal(const DeviceContext& context, int diag_size = len2 < len1 ? 
len2 : len1; if (diag_size > 0) { - auto ret_strides = vectorize(input_stride); - auto ret_dims = vectorize(input_dims); + auto ret_strides = common::vectorize(input_stride); + auto ret_dims = common::vectorize(input_dims); ret_strides.erase(ret_strides.begin() + std::max(dim1_, dim2_)); ret_strides.erase(ret_strides.begin() + std::min(dim1_, dim2_)); ret_dims.erase(ret_dims.begin() + std::max(dim1_, dim2_)); @@ -102,15 +102,15 @@ DenseTensor Diagonal(const DeviceContext& context, ret_strides.push_back(stride1 + stride2); ret_dims.push_back(diag_size); DenseTensor diag; - DDim diag_dims = phi::make_ddim(ret_dims); - auto dig_stride = phi::stride(diag_dims); + DDim diag_dims = common::make_ddim(ret_dims); + auto dig_stride = common::stride(diag_dims); diag.Resize(diag_dims); auto diag_data = context.template Alloc(&diag); int64_t pos = std::abs(offset) * offset_stride; int64_t dim_size = ret_strides.size(); #if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector diag_vec(vectorize(dig_stride)); + thrust::device_vector diag_vec(common::vectorize(dig_stride)); const int64_t* diag_arr = thrust::raw_pointer_cast(diag_vec.data()); thrust::device_vector ret_vec(ret_strides); const int64_t* ret_arr = thrust::raw_pointer_cast(ret_vec.data()); diff --git a/paddle/phi/kernels/funcs/dims_simplifier.h b/paddle/phi/kernels/funcs/dims_simplifier.h index 35621ed0abddb3..036cd2cc812c9b 100644 --- a/paddle/phi/kernels/funcs/dims_simplifier.h +++ b/paddle/phi/kernels/funcs/dims_simplifier.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "glog/logging.h" @@ -39,15 +39,15 @@ struct BroadcastDimsSimplifier { N = std::max(static_cast(ins.size()), 2); in_dims.resize(N); rank = dims.size(); - out_dims = phi::vectorize(dims); + out_dims = common::vectorize(dims); if (ins.size() == 1) { // When ins.size() = 1, broadcast input to output. - in_dims[0] = phi::vectorize(ins[0]->dims()); + in_dims[0] = common::vectorize(ins[0]->dims()); // Add out_dims to in_dims to avoid errors in dims merging. in_dims[1] = out_dims; } else { for (int j = 0; j < N; ++j) { - in_dims[j] = phi::vectorize(ins[j]->dims()); + in_dims[j] = common::vectorize(ins[j]->dims()); } } ExtendInputDimensions(axis); @@ -122,8 +122,8 @@ struct BroadcastDimsSimplifier { out_idx, out_dims[out_idx], in_dim[in_idx], - phi::make_ddim(in_dim), - phi::make_ddim(out_dims))); + common::make_ddim(in_dim), + common::make_ddim(out_dims))); } } in_dim.resize(rank); diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index 14696b45c78db1..3b0df468847138 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -280,7 +280,7 @@ void DropoutFwGPUKernelDriver( if (!is_test && mask) { auto* mask_data = mask->data(); - size_t size = phi::product(mask->dims()); + size_t size = common::product(mask->dims()); if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 8ddb3f406ddfe3..c92acdaf4180be 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -38,8 +38,7 @@ namespace phi { /* Packing scalar type T(float, int etc.) 
into Array type for supporting multiple-output feature in elementwise system.*/ template -using ConditionalT = - typename std::conditional_t>; +using ConditionalT = typename std::conditional_t>; namespace funcs { using DDim = phi::DDim; @@ -624,7 +623,7 @@ struct SameDimsElementwisePrimitiveCaller { template struct ElementwiseWriteDataCallerBc { __device__ __forceinline__ void operator()( - phi::Array<_ptr_ OutT *, NumOuts> outs, + Array<_ptr_ OutT *, NumOuts> outs, ConditionalT src[VecSize], kps::IndexType block_offset, int num, @@ -647,7 +646,7 @@ struct ElementwiseWriteDataCallerBc { template struct ElementwiseWriteDataCallerBc { - __device__ __forceinline__ void operator()(phi::Array<_ptr_ OutT *, 1> outs, + __device__ __forceinline__ void operator()(Array<_ptr_ OutT *, 1> outs, OutT src[VecSize], kps::IndexType block_offset, int num, @@ -664,8 +663,8 @@ template __device__ void VectorizedElementwiseKernelImpl( - const phi::Array &in, - phi::Array<_ptr_ OutT *, NumOuts> outs, + const Array &in, + Array<_ptr_ OutT *, NumOuts> outs, kps::IndexType offset, int num, int read_lens, @@ -690,8 +689,8 @@ __device__ void VectorizedElementwiseKernelImpl( template __global__ void VectorizedElementwiseKernel( - phi::Array ins, - phi::Array<_ptr_ OutT *, NumOuts> outs, + Array ins, + Array<_ptr_ OutT *, NumOuts> outs, kps::IndexType numel, kps::IndexType main_offset, int read_lens, @@ -731,8 +730,8 @@ void LaunchElementwiseKernel(const KPDevice &ctx, // For large tensor numel * sizeof(T) > 2^31, we must use int64_t as index // type. int64_t numel = (*outs)[0]->numel(); - phi::Array ins_data; - phi::Array<_ptr_ OutT *, NumOuts> outs_data; + Array ins_data; + Array<_ptr_ OutT *, NumOuts> outs_data; using Traits = phi::funcs::FunctionTraits; using ArgsT = typename Traits::ArgsTuple; diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 4cc12cf641ad9c..eaf527fbba9f6b 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -14,12 +14,12 @@ limitations under the License. 
*/ #pragma once +#include "paddle/common/macros.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/core/macros.h" #if defined(__xpu__) #include diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index 5ff70c86d5fe8b..8e5e45b861a3ae 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -244,8 +244,8 @@ void CommonElementwiseBroadcastBackward(const CPUContext &ctx, } VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << phi::make_ddim(x_dims_array) - << " ydim:" << phi::make_ddim(y_dims_array); + << common::make_ddim(x_dims_array) + << " ydim:" << common::make_ddim(y_dims_array); CommonGradBroadcastCPU(x, y, @@ -393,7 +393,7 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, DenseTensor *dy, DX_OP dx_op, DY_OP dy_op) { - size_t N = static_cast(phi::product(x_dim)); + size_t N = static_cast(common::product(x_dim)); phi::funcs::ForRange for_range(dev_ctx, N); for_range(ElemwiseGradNoBroadcast{ x.data(), @@ -1677,8 +1677,8 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx, } VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << phi::make_ddim(x_dims_array) - << " ydim:" << phi::make_ddim(y_dims_array); + << common::make_ddim(x_dims_array) + << " ydim:" << common::make_ddim(y_dims_array); CommonGradBroadcastCUDA(x, y, diff --git a/paddle/phi/kernels/funcs/elementwise_utils.h b/paddle/phi/kernels/funcs/elementwise_utils.h index 3790044346dc42..b94a8fbd53a6d6 100644 --- a/paddle/phi/kernels/funcs/elementwise_utils.h +++ b/paddle/phi/kernels/funcs/elementwise_utils.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" @@ -75,6 +76,7 @@ inline void GetMidDims(const DDim &x_dims, inline DDim TrimTrailingSingularDims(const DDim &dims) { // Remove trailing dimensions of size 1 for y + using common::make_dim; auto actual_dims_size = dims.size(); for (; actual_dims_size != 0; --actual_dims_size) { if (dims[actual_dims_size - 1] != 1) break; @@ -86,9 +88,9 @@ inline DDim TrimTrailingSingularDims(const DDim &dims) { trim_dims[i] = dims[i]; } if (trim_dims.size() == 0) { - return DDim(phi::make_dim()); + return DDim(make_dim()); } - DDim actual_dims = phi::make_ddim(trim_dims); + DDim actual_dims = common::make_ddim(trim_dims); return actual_dims; } diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index 716d5c3979459b..84a8cc309516bb 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -397,8 +397,8 @@ void FCInt8Functor::operator()( const int8_t* W = w_tensor->data(); DenseTensor quant_x_tensor, quant_y_tensor; - quant_x_tensor.Resize(phi::make_ddim({M, K})); - quant_y_tensor.Resize(phi::make_ddim({M, N})); + quant_x_tensor.Resize(common::make_ddim({M, K})); + quant_y_tensor.Resize(common::make_ddim({M, N})); context.template Alloc(&quant_x_tensor, quant_x_tensor.numel() * sizeof(int8_t)); context.template Alloc(&quant_y_tensor, @@ -417,7 +417,7 @@ void FCInt8Functor::operator()( context, quant_x_tensor, *w_tensor, false, false, &quant_y_tensor); DenseTensor scale_weights_dev; - scale_weights_dev.Resize(phi::make_ddim({N})); + scale_weights_dev.Resize(common::make_ddim({N})); context.template Alloc(&scale_weights_dev, scale_weights_dev.numel() * sizeof(float)); float* scale_weights_dev_ptr = scale_weights_dev.data(); diff --git a/paddle/phi/kernels/funcs/fft.cc b/paddle/phi/kernels/funcs/fft.cc index 31ea37f5b5037e..97502787b69381 100644 --- a/paddle/phi/kernels/funcs/fft.cc +++ b/paddle/phi/kernels/funcs/fft.cc @@ -75,7 +75,7 @@ void exec_fft(const phi::CPUContext& ctx, for (int i = 0; i < signal_ndim; i++) { collapsed_input_shape_.push_back(in_sizes[axes[i]]); } - phi::DDim collapsed_input_shape = phi::make_ddim(collapsed_input_shape_); + phi::DDim collapsed_input_shape = common::make_ddim(collapsed_input_shape_); transposed_input.Resize(collapsed_input_shape); DenseTensor& collapsed_input = transposed_input; @@ -87,7 +87,7 @@ void exec_fft(const phi::CPUContext& ctx, for (int i = 0; i < signal_ndim; i++) { collapsed_output_shape_.push_back(out_sizes[axes[i]]); } - phi::DDim collapsed_output_shape = phi::make_ddim(collapsed_output_shape_); + phi::DDim collapsed_output_shape = common::make_ddim(collapsed_output_shape_); DenseTensor collapsed_output; collapsed_output.Resize(collapsed_output_shape); ctx.Alloc(&collapsed_output); @@ -99,8 +99,8 @@ void exec_fft(const phi::CPUContext& ctx, signal_sizes[1 + i] = std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); } - const phi::DDim input_stride = phi::stride(collapsed_input_shape); - const phi::DDim output_stride = phi::stride(collapsed_output_shape); + const phi::DDim input_stride = common::stride(collapsed_input_shape); + const phi::DDim output_stride = common::stride(collapsed_output_shape); DftiDescriptor desc = plan_mkl_fft(x.dtype(), out->dtype(), @@ -220,9 +220,9 @@ struct FFTC2CFunctor { using C = std::complex; const auto& input_dim = x.dims(); - const std::vector in_sizes = phi::vectorize(input_dim); + const std::vector 
in_sizes = common::vectorize(input_dim); std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); + common::vectorize(common::stride(input_dim)); const int64_t data_size = sizeof(C); std::transform(in_strides.begin(), in_strides.end(), @@ -263,9 +263,9 @@ struct FFTR2CFunctor { using C = std::complex; const auto& input_dim = x.dims(); - const std::vector in_sizes = phi::vectorize(input_dim); + const std::vector in_sizes = common::vectorize(input_dim); std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); + common::vectorize(common::stride(input_dim)); { const int64_t data_size = sizeof(R); std::transform(in_strides.begin(), @@ -275,9 +275,9 @@ struct FFTR2CFunctor { } const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); + const std::vector out_sizes = common::vectorize(output_dim); std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); + common::vectorize(common::stride(output_dim)); { const int64_t data_size = sizeof(C); std::transform(out_strides.begin(), @@ -320,9 +320,9 @@ struct FFTC2RFunctor { using C = std::complex; const auto& input_dim = x.dims(); - const std::vector in_sizes = phi::vectorize(input_dim); + const std::vector in_sizes = common::vectorize(input_dim); std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); + common::vectorize(common::stride(input_dim)); { const int64_t data_size = sizeof(C); std::transform(in_strides.begin(), @@ -332,9 +332,9 @@ struct FFTC2RFunctor { } const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); + const std::vector out_sizes = common::vectorize(output_dim); std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); + common::vectorize(common::stride(output_dim)); { const int64_t data_size = sizeof(R); std::transform(out_strides.begin(), diff --git a/paddle/phi/kernels/funcs/fft.cu b/paddle/phi/kernels/funcs/fft.cu index e13a79b335ac0e..c70f615e80fa4d 100644 --- a/paddle/phi/kernels/funcs/fft.cu +++ b/paddle/phi/kernels/funcs/fft.cu @@ -17,8 +17,8 @@ #include "paddle/phi/kernels/funcs/fft.h" #include "paddle/phi/kernels/funcs/fft_cache.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/assign_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -150,7 +150,7 @@ void exec_fft(const phi::GPUContext& ctx, for (int i = 0; i < signal_ndim; i++) { collapsed_input_shape_.push_back(in_sizes[axes[i]]); } - phi::DDim collapsed_input_shape = phi::make_ddim(collapsed_input_shape_); + phi::DDim collapsed_input_shape = common::make_ddim(collapsed_input_shape_); transposed_input.Resize(collapsed_input_shape); DenseTensor& collapsed_input = transposed_input; @@ -162,7 +162,7 @@ void exec_fft(const phi::GPUContext& ctx, for (int i = 0; i < signal_ndim; i++) { collapsed_output_shape_.push_back(out_sizes[axes[i]]); } - phi::DDim collapsed_output_shape = phi::make_ddim(collapsed_output_shape_); + phi::DDim collapsed_output_shape = common::make_ddim(collapsed_output_shape_); DenseTensor collapsed_output; collapsed_output.Resize(collapsed_output_shape); ctx.Alloc(&collapsed_output); @@ -267,7 +267,7 @@ struct FFTC2CFunctor { } } - std::vector out_dims = phi::vectorize(x.dims()); + std::vector out_dims = common::vectorize(x.dims()); detail::exec_normalization( ctx, *out, out, normalization, out_dims, axes); } @@ -281,7 +281,7 @@ struct FFTC2RFunctor { const std::vector& 
axes, FFTNormMode normalization, bool forward) { - std::vector out_dims = phi::vectorize(out->dims()); + std::vector out_dims = common::vectorize(out->dims()); if (detail::use_optimized_fft_path(axes)) { DenseTensor x_copy = Assign(ctx, x); @@ -325,7 +325,7 @@ struct FFTR2CFunctor { forward); } - const auto in_dims = phi::vectorize(x.dims()); + const auto in_dims = common::vectorize(x.dims()); detail::exec_normalization( ctx, *out, out, normalization, in_dims, axes); } diff --git a/paddle/phi/kernels/funcs/fft_fill_conj.h b/paddle/phi/kernels/funcs/fft_fill_conj.h index 91d859020f88b9..ab6d351986ecc2 100644 --- a/paddle/phi/kernels/funcs/fft_fill_conj.h +++ b/paddle/phi/kernels/funcs/fft_fill_conj.h @@ -142,10 +142,10 @@ void FFTFillConj(const DeviceContext& ctx, DenseTensor* dst, const std::vector& axes) { std::vector src_strides_v = - phi::vectorize(phi::stride(src->dims())); + common::vectorize(common::stride(src->dims())); std::vector dst_strides_v = - phi::vectorize(phi::stride(dst->dims())); - std::vector dst_shape_v = phi::vectorize(dst->dims()); + common::vectorize(common::stride(dst->dims())); + std::vector dst_shape_v = common::vectorize(dst->dims()); const auto src_data = src->data(); auto dst_data = dst->data(); const auto last_axis = axes.back(); diff --git a/paddle/phi/kernels/funcs/fft_key.h b/paddle/phi/kernels/funcs/fft_key.h index 5893cfc6ba019f..8a577754cf051e 100644 --- a/paddle/phi/kernels/funcs/fft_key.h +++ b/paddle/phi/kernels/funcs/fft_key.h @@ -102,8 +102,8 @@ static FFTConfigKey create_fft_configkey(const DenseTensor& input, auto out_size = output.dims()[i]; signal_size[i] = std::max(in_size, out_size); } - FFTConfigKey key(phi::vectorize(input.dims()), - phi::vectorize(output.dims()), + FFTConfigKey key(common::vectorize(input.dims()), + common::vectorize(output.dims()), signal_size, fft_type, value_type); diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h index 484fbd21dc7709..7b6f672f47f1b5 100644 --- a/paddle/phi/kernels/funcs/for_range.h +++ b/paddle/phi/kernels/funcs/for_range.h @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "paddle/common/macros.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/macros.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h index eb5f0fa540f8d3..c07ff2e48864f1 100644 --- a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h +++ b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h @@ -394,11 +394,11 @@ void ComputeFusedGemmEpilogueForward(const phi::GPUContext& dev_ctx, if (activation == "relu") { phi::DataType rs_type = phi::DataType::BOOL; size_t reserve_space_size = - phi::product(reserve_space->dims()) * SizeOf(rs_type); + common::product(reserve_space->dims()) * SizeOf(rs_type); dev_ctx.Alloc(reserve_space, rs_type, reserve_space_size); } else { size_t reserve_space_size = - phi::product(reserve_space->dims()) * sizeof(T); + common::product(reserve_space->dims()) * sizeof(T); dev_ctx.Alloc(reserve_space, reserve_space_size); } diff --git a/paddle/phi/kernels/funcs/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h index 2acb49446d7bbb..3d4a5256f5fa97 100644 --- a/paddle/phi/kernels/funcs/gather.cu.h +++ b/paddle/phi/kernels/funcs/gather.cu.h @@ -135,8 +135,8 @@ void GPUGatherNd(const phi::GPUContext& ctx, // final dim int64_t end_size = index_dims[index_dims_size - 1]; // remain dim - auto remain_ddim = phi::slice_ddim(index_dims, 0, index_dims_size - 1); - int64_t remain_numel = phi::product(remain_ddim); + auto remain_ddim = common::slice_ddim(index_dims, 0, index_dims_size - 1); + int64_t remain_numel = common::product(remain_ddim); // slice size int64_t slice_size = 1; for (int64_t i = end_size; i < input_dims_size; ++i) { @@ -250,7 +250,7 @@ void GatherV2CUDAFunction(const DenseTensor* input, outer_dim_size *= input_dim[i]; out_dim_vec.push_back(input_dim[i]); } - auto out_dim = phi::make_ddim(out_dim_vec); + auto out_dim = common::make_ddim(out_dim_vec); out->Resize(out_dim); auto* out_data = ctx.Alloc(out); diff --git a/paddle/phi/kernels/funcs/gather.h b/paddle/phi/kernels/funcs/gather.h index 50f7f4fa0322cb..519bc9fb962c9c 100644 --- a/paddle/phi/kernels/funcs/gather.h +++ b/paddle/phi/kernels/funcs/gather.h @@ -18,10 +18,10 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/ddim.h" +#include "paddle/common/macros.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { namespace funcs { @@ -111,8 +111,8 @@ void CPUGatherNd(const phi::CPUContext& ctx UNUSED, // final dim int64_t end_size = index_dims[index_dims_size - 1]; // remain dim - auto remain_ddim = phi::slice_ddim(index_dims, 0, index_dims_size - 1); - int64_t remain_numel = phi::product(remain_ddim); + auto remain_ddim = common::slice_ddim(index_dims, 0, index_dims_size - 1); + int64_t remain_numel = common::product(remain_ddim); // slice size int64_t slice_size = 1; for (int64_t i = end_size; i < input_dims_size; ++i) { @@ -195,7 +195,7 @@ void GatherV2Function(const phi::CPUContext& ctx, outer_dim_size *= input_dim[i]; out_dim_vec.push_back(input_dim[i]); } - auto out_dim = phi::make_ddim(out_dim_vec); + auto out_dim = common::make_ddim(out_dim_vec); out->Resize(out_dim); auto* out_data = ctx.Alloc(out); diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc index be07c68b0fd338..7be86351c47ff6 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.cc +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "glog/logging.h" -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/im2col.h b/paddle/phi/kernels/funcs/im2col.h index 73b2866924d1e9..c6b8d22b8c60c4 100644 --- a/paddle/phi/kernels/funcs/im2col.h +++ b/paddle/phi/kernels/funcs/im2col.h @@ -16,10 +16,10 @@ limitations under the License. 
*/ #include -#include "paddle/phi/common/layout.h" +#include "paddle/common/errors.h" +#include "paddle/common/layout.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/index_calculator.h b/paddle/phi/kernels/funcs/index_calculator.h index 13697e443e16de..4e306cb87a480d 100644 --- a/paddle/phi/kernels/funcs/index_calculator.h +++ b/paddle/phi/kernels/funcs/index_calculator.h @@ -34,8 +34,7 @@ constexpr int kMaxRank = phi::DDim::kMaxRank; namespace details { // Convert dims from vector to array template -static inline phi::Array VectorToArray( - const VectorLikeType& vec) { +static inline Array VectorToArray(const VectorLikeType& vec) { PADDLE_ENFORCE_LE( vec.size(), ElementCount, @@ -44,7 +43,7 @@ static inline phi::Array VectorToArray( vec.size(), ElementCount)); size_t n = static_cast(vec.size()); - phi::Array ret; + Array ret; for (size_t i = 0; i < n; ++i) { ret[i] = vec[i]; } @@ -99,11 +98,11 @@ struct IndexCalculator { } int dim; - phi::Array dims; - phi::Array strides; - phi::Array reduce_strides; + Array dims; + Array strides; + Array reduce_strides; #ifndef PADDLE_WITH_XPU_KP - phi::Array divmoders; + Array divmoders; #endif }; diff --git a/paddle/phi/kernels/funcs/index_put_utils.h b/paddle/phi/kernels/funcs/index_put_utils.h index 09da00d7cca147..e6b70e8eb7305f 100644 --- a/paddle/phi/kernels/funcs/index_put_utils.h +++ b/paddle/phi/kernels/funcs/index_put_utils.h @@ -15,12 +15,12 @@ #pragma once #include +#include "paddle/common/array.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/utils/array.h" #include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/nonzero_kernel.h" @@ -46,7 +46,7 @@ phi::DenseTensor GetReshapeAndExpandTensor(const Context& dev_ctx, const phi::DDim& res_dim, const phi::DDim& bd_dim, int index) { - std::vector before_dims = phi::vectorize(tensor.dims()); + std::vector before_dims = common::vectorize(tensor.dims()); std::vector mid_dims(res_dim.size(), 1); if (index == 0) { @@ -58,13 +58,13 @@ phi::DenseTensor GetReshapeAndExpandTensor(const Context& dev_ctx, } phi::DenseTensor mid_tensor(tensor.dtype()); - mid_tensor.Resize(phi::make_ddim(mid_dims)); + mid_tensor.Resize(common::make_ddim(mid_dims)); ReshapeInferKernel(dev_ctx, tensor, IntArray(mid_dims), &mid_tensor); phi::DenseTensor res_tensor(tensor.dtype()); res_tensor.Resize(res_dim); ExpandKernel( - dev_ctx, mid_tensor, IntArray(phi::vectorize(res_dim)), &res_tensor); + dev_ctx, mid_tensor, IntArray(common::vectorize(res_dim)), &res_tensor); return res_tensor; } @@ -86,7 +86,7 @@ std::vector DealWithBoolIndices( phi::errors::InvalidArgument("the only bool tensor in indices should " "have number of dimension at least 1")); phi::DenseTensor nonzero_indices(phi::DataType::INT64); - nonzero_indices.Resize(phi::make_ddim({-1, rank})); + nonzero_indices.Resize(common::make_ddim({-1, rank})); NonZeroKernel(dev_ctx, *indices_v[i], &nonzero_indices); if (nonzero_indices.numel() == 0) { @@ -99,7 +99,7 @@ std::vector DealWithBoolIndices( for (int i = 0; i < rank; ++i) { tmp_indices_v->emplace_back( DenseTensor(phi::DataType::INT64) - .Resize(phi::make_ddim({nonzero_indices.dims()[0]}))); + 
.Resize(common::make_ddim({nonzero_indices.dims()[0]}))); } for (int i = 0; i < rank; ++i) { integer_indices[i] = &((*tmp_indices_v)[i + tmp_ix]); @@ -173,7 +173,7 @@ static phi::DDim BroadCastTensorsDims( } target_dims[target_rank - index - 1] = target_dim_size; } - return phi::make_ddim(target_dims); + return common::make_ddim(target_dims); } template @@ -207,7 +207,7 @@ void DealWithIndices(const Context& dev_ctx, std::vector* res_dim_v) { size_t total_dims = x.dims().size(); if (int_indices_v.size() < total_dims) { - std::vector tmp_x_dims = phi::vectorize(x.dims()); + std::vector tmp_x_dims = common::vectorize(x.dims()); int len_bd_dim = bd_dim.size(); res_dim_v->insert(res_dim_v->end(), tmp_x_dims.begin() + int_indices_v.size(), @@ -225,7 +225,7 @@ void DealWithIndices(const Context& dev_ctx, reshaped_indices_v.insert( reshaped_indices_v.end(), range_tensor_v.begin(), range_tensor_v.end()); - phi::DDim res_dim = phi::make_ddim(*res_dim_v); + phi::DDim res_dim = common::make_ddim(*res_dim_v); for (size_t i = 0; i < reshaped_indices_v.size(); ++i) { tmp_res_indices_v->emplace_back( @@ -261,7 +261,7 @@ void DealWithIndices(const Context& dev_ctx, ExpandKernel( dev_ctx, int_indices_v_tmp[i], - IntArray(phi::vectorize(bd_dim)), + IntArray(common::vectorize(bd_dim)), &(*tmp_res_indices_v)[i]); } else { tmp_res_indices_v->emplace_back(int_indices_v_tmp[i]); @@ -323,7 +323,7 @@ phi::DenseTensor GetRangeCudaTensor(const Context& dev_ctx, int64_t N, phi::DataType dtype) { phi::DenseTensor res(dtype); - res.Resize(phi::make_ddim({N})); + res.Resize(common::make_ddim({N})); DenseTensor* p_res = &res; T* out = dev_ctx.template Alloc(p_res); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, N); @@ -346,7 +346,7 @@ phi::DenseTensor GetRangeTensor(const Context& dev_ctx, int64_t N, phi::DataType dtype) { phi::DenseTensor res(dtype); - res.Resize(phi::make_ddim({N})); + res.Resize(common::make_ddim({N})); DenseTensor* p_res = &res; T* out = dev_ctx.template Alloc(p_res); range_kernel(N, out); diff --git a/paddle/phi/kernels/funcs/interpolate_function.h b/paddle/phi/kernels/funcs/interpolate_function.h index 23731285926da4..bbfc54e5e2dc03 100644 --- a/paddle/phi/kernels/funcs/interpolate_function.h +++ b/paddle/phi/kernels/funcs/interpolate_function.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/context_pool.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #if defined(__NVCC__) || defined(__HIPCC__) @@ -87,8 +87,8 @@ inline std::vector get_new_shape( for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { auto tensor = list_new_shape_tensor[i]; phi::DeviceContext* dev_ctx = pool.Get(tensor->place()); - PADDLE_ENFORCE_EQ(tensor->dims() == phi::make_ddim({1}) || - tensor->dims() == phi::make_ddim({}), + PADDLE_ENFORCE_EQ(tensor->dims() == common::make_ddim({1}) || + tensor->dims() == common::make_ddim({}), true, errors::InvalidArgument( "The shape of dimension tensor should be [1] or []," diff --git a/paddle/phi/kernels/funcs/jit/CMakeLists.txt b/paddle/phi/kernels/funcs/jit/CMakeLists.txt index fd44ca308107cf..248bdf1c215c32 100644 --- a/paddle/phi/kernels/funcs/jit/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/jit/CMakeLists.txt @@ -29,7 +29,7 @@ endif() cc_test( jit_kernel_test SRCS test.cc - DEPS phi) + DEPS phi common) if(NOT WIN32) set(cuda_less12_and_gcc_greater12 false) @@ -40,7 +40,7 @@ if(NOT WIN32) endif() endif() 
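The CMake edits here follow from the code moves: once the DDim/enforce utilities live in the standalone common library, any target that names phi in DEPS must name common as well, or the moved symbols fail to resolve at link time. A hypothetical consumer illustrating the dependency (the file and target names below are illustrative, not part of this patch):

// ddim_consumer.cc -- would be built via cc_test(... DEPS phi common);
// dropping `common` from DEPS leaves common::make_ddim undefined at link.
#include "paddle/common/ddim.h"

int main() {
  phi::DDim d = common::make_ddim({2, 3});
  return d.size() == 2 ? 0 : 1;  // DDim::size() reports the rank
}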
if(NOT cuda_less12_and_gcc_greater12) - cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS phi) + cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS phi common) endif() endif() if(WITH_TESTING AND TEST jit_kernel_test) diff --git a/paddle/phi/kernels/funcs/jit/kernel_base.h b/paddle/phi/kernels/funcs/jit/kernel_base.h index 78bedf184975cd..b8a638b48fc8df 100644 --- a/paddle/phi/kernels/funcs/jit/kernel_base.h +++ b/paddle/phi/kernels/funcs/jit/kernel_base.h @@ -15,7 +15,7 @@ #pragma once #include -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" #include "paddle/phi/kernels/funcs/jit/macro.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/registry.h b/paddle/phi/kernels/funcs/jit/registry.h index e9b371312548f6..26849a66097058 100644 --- a/paddle/phi/kernels/funcs/jit/registry.h +++ b/paddle/phi/kernels/funcs/jit/registry.h @@ -19,8 +19,8 @@ #include #include // for std::move +#include "paddle/common/macros.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/kernels/funcs/jit/kernel_base.h" #include "paddle/phi/kernels/funcs/jit/kernel_pool.h" diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 1a52e57e45f236..6a82875819161b 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -26,11 +26,11 @@ namespace cub = hipcub; #include "glog/logging.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index bfb2dc6d78e17f..8642d9d9e602e3 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -103,8 +103,8 @@ void TransposeNormal::operator()( phi::DenseTensor* out, const std::vector& axis) { const int rank = static_cast(axis.size()); - auto in_stride = phi::stride(in.dims()); - auto out_stride = phi::stride(out->dims()); + auto in_stride = common::stride(in.dims()); + auto out_stride = common::stride(out->dims()); const T* in_ptr = in.data(); T* out_ptr = out->data(); @@ -191,7 +191,7 @@ void set_constant_with_place( phi::DenseTensor*); auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)(context, - phi::IntArray(phi::vectorize(tensor->dims())), + phi::IntArray(common::vectorize(tensor->dims())), phi::Scalar(value), tensor->dtype(), tensor); diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index bdd97616e0a660..16d4aed2f10618 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -220,8 +220,8 @@ void TransposeNormal::operator()( phi::DenseTensor* out, const std::vector& axis) { const int rank = axis.size(); - auto in_stride = phi::stride(in.dims()); - auto out_stride = phi::stride(out->dims()); + auto in_stride = common::stride(in.dims()); + auto out_stride = common::stride(out->dims()); auto* in_ptr = in.data(); auto* out_ptr = out->data(); diff --git a/paddle/phi/kernels/funcs/matrix_reduce.cc b/paddle/phi/kernels/funcs/matrix_reduce.cc index 34d84070497fc5..e20d98984eb5aa 100644 --- a/paddle/phi/kernels/funcs/matrix_reduce.cc +++ 
b/paddle/phi/kernels/funcs/matrix_reduce.cc @@ -28,9 +28,10 @@ class MatrixReduceSumFunctor { DenseTensor* out) { // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); + const std::vector in_dims = common::vectorize(in.dims()); auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); + const std::vector out_dims = + common::vectorize(out->dims()); auto out_size = out_dims.size(); std::vector out_bst_dims(in_size); @@ -39,7 +40,7 @@ class MatrixReduceSumFunctor { std::copy(out_dims.data(), out_dims.data() + out_size, out_bst_dims.data() + in_size - out_size); - out->Resize(phi::make_ddim(out_bst_dims)); + out->Resize(common::make_ddim(out_bst_dims)); std::vector out_reduce_dims; for (size_t idx = 0; idx <= in_size - 3; idx++) { diff --git a/paddle/phi/kernels/funcs/matrix_reduce.cu b/paddle/phi/kernels/funcs/matrix_reduce.cu index 5c3ebd6bb01671..f4305914c41713 100644 --- a/paddle/phi/kernels/funcs/matrix_reduce.cu +++ b/paddle/phi/kernels/funcs/matrix_reduce.cu @@ -26,9 +26,9 @@ class MatrixReduceSumFunctor { DenseTensor* out) { // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); + const std::vector in_dims = common::vectorize(in.dims()); auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); + const std::vector out_dims = common::vectorize(out->dims()); auto out_size = out_dims.size(); std::vector out_bst_dims(in_size); @@ -37,7 +37,7 @@ class MatrixReduceSumFunctor { std::copy(out_dims.data(), out_dims.data() + out_size, out_bst_dims.data() + in_size - out_size); - out->Resize(phi::make_ddim(out_bst_dims)); + out->Resize(common::make_ddim(out_bst_dims)); std::vector out_reduce_dims; for (size_t idx = 0; idx <= in_size - 3; idx++) { diff --git a/paddle/phi/kernels/funcs/matrix_solve.cu b/paddle/phi/kernels/funcs/matrix_solve.cu index 0655bb4d958969..0bd1522e9f58ee 100644 --- a/paddle/phi/kernels/funcs/matrix_solve.cu +++ b/paddle/phi/kernels/funcs/matrix_solve.cu @@ -64,7 +64,7 @@ void MatrixSolveFunctor::operator()(const Context& context, // because cuBlas assumes column-major while Paddle uses row-majar. DenseTensor tmp_b(b.type()); const auto& new_dims_vec = getNewDimsVec(b_dims); - tmp_b.Resize(phi::make_ddim(new_dims_vec)); + tmp_b.Resize(common::make_ddim(new_dims_vec)); context.template Alloc(&tmp_b); phi::funcs::TransposeNormal trans; std::vector new_axis = getNewAxis(b_rank); diff --git a/paddle/phi/kernels/funcs/matrix_solve.h b/paddle/phi/kernels/funcs/matrix_solve.h index 3856c06c1b25fc..f8225bd482385e 100644 --- a/paddle/phi/kernels/funcs/matrix_solve.h +++ b/paddle/phi/kernels/funcs/matrix_solve.h @@ -68,7 +68,7 @@ static std::vector getNewAxis(const int b_rank) { // for Resize static std::vector getNewDimsVec(const DDim& b_dims) { - std::vector b_dims_vec = phi::vectorize(b_dims); + std::vector b_dims_vec = common::vectorize(b_dims); int size = b_dims_vec.size(); if (size >= 2) { // swap the last 2 elements in b_dims_vec diff --git a/paddle/phi/kernels/funcs/maxouting.h b/paddle/phi/kernels/funcs/maxouting.h index c6242318a3c0dc..99b781ebba0aae 100644 --- a/paddle/phi/kernels/funcs/maxouting.h +++ b/paddle/phi/kernels/funcs/maxouting.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "paddle/common/macros.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/macros.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/mode.h b/paddle/phi/kernels/funcs/mode.h index 632b0ce7e15104..d6cf68c092317e 100644 --- a/paddle/phi/kernels/funcs/mode.h +++ b/paddle/phi/kernels/funcs/mode.h @@ -152,7 +152,7 @@ static void GetModebySort(const phi::GPUContext& dev_ctx, T* out_tensor, int64_t* indices_tensor) { DenseTensor input_tmp; - input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); + input_tmp.Resize(common::make_ddim({num_rows, num_cols})); T* input_tmp_data = dev_ctx.Alloc(&input_tmp); phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp); diff --git a/paddle/phi/kernels/funcs/nanmedian_utils.h b/paddle/phi/kernels/funcs/nanmedian_utils.h index edcdc10b885956..6acbe25bf75bb2 100644 --- a/paddle/phi/kernels/funcs/nanmedian_utils.h +++ b/paddle/phi/kernels/funcs/nanmedian_utils.h @@ -56,7 +56,7 @@ void PostprocessMedianGradKernel(const Context& dev_ctx, } } - input->Resize(make_ddim(reshape_back)); + input->Resize(common::make_ddim(reshape_back)); funcs::TransCompute( static_cast(trans_back.size()), dev_ctx, *input, x, trans_back); } @@ -104,7 +104,7 @@ void PreprocessMedianKernel(const Context& dev_ctx, dev_ctx.template Alloc(x); funcs::TransCompute(ndims, dev_ctx, input, x, perm); - x->Resize(make_ddim(reshape)); + x->Resize(common::make_ddim(reshape)); } } // namespace funcs diff --git a/paddle/phi/kernels/funcs/norm_utils.cu.h b/paddle/phi/kernels/funcs/norm_utils.cu.h index ecd2ac50509f6b..0d8fa486cc065a 100644 --- a/paddle/phi/kernels/funcs/norm_utils.cu.h +++ b/paddle/phi/kernels/funcs/norm_utils.cu.h @@ -24,7 +24,7 @@ limitations under the License. */ #include namespace cub = hipcub; #endif -#include "paddle/phi/common/layout.h" +#include "paddle/common/layout.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/reduce_function.h" diff --git a/paddle/phi/kernels/funcs/norm_utils.h b/paddle/phi/kernels/funcs/norm_utils.h index 5c898549b353ea..c3a3b07ae08cca 100644 --- a/paddle/phi/kernels/funcs/norm_utils.h +++ b/paddle/phi/kernels/funcs/norm_utils.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h index bf2409d2e502b8..1ffd747735543c 100644 --- a/paddle/phi/kernels/funcs/pooling.h +++ b/paddle/phi/kernels/funcs/pooling.h @@ -18,10 +18,10 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/macros.h" // import FLT_MAX #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/core/macros.h" // import FLT_MAX #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -427,7 +427,7 @@ inline void UpdatePadding(std::vector* paddings, const std::vector& strides, const std::vector& kernel_size) { // set padding size == data_dims.size() * 2 - auto data_shape = vectorize(data_dims); + auto data_shape = common::vectorize(data_dims); if (static_cast(paddings->size()) == data_dims.size()) { for (int i = 0; i < data_dims.size(); ++i) { T copy_pad = *(paddings->begin() + 2 * i); diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 1bbdd019a7c4bd..564c02c9f9f79b 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -57,10 +57,10 @@ using dim3 = phi::kps::dim3; #endif +#include "paddle/common/array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_utils.h" -#include "paddle/phi/core/utils/array.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -288,8 +288,8 @@ struct ReduceConfig { const KPDevice& dev_ctx, phi::DenseTensor* tmp) { if (should_reduce_again) { - tmp->Resize( - phi::make_ddim({static_cast(left_num * grid.z * grid.y)})); + tmp->Resize(common::make_ddim( + {static_cast(left_num * grid.z * grid.y)})); tmp_data = dev_ctx.Alloc(tmp); } } @@ -1060,7 +1060,7 @@ void ReduceKernel(const KPDevice& dev_ctx, #endif dev_ctx.Alloc(y); - auto x_dim = phi::vectorize(x.dims()); + auto x_dim = common::vectorize(x.dims()); if (x_dim.size() == 0) { std::vector inputs = {&x}; @@ -1238,13 +1238,13 @@ void ReduceFunctor(const Context& context, DDim out_dims = output->dims(); if (keep_dim && x_rank > 1) { const int kDelFlag = -2; - auto dims_vector = phi::vectorize(out_dims); + auto dims_vector = common::vectorize(out_dims); for (size_t i = 0; i < dims_ref.size(); ++i) { dims_vector[dims_ref[i]] = kDelFlag; } dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), dims_vector.end()); - out_dims = phi::make_ddim(dims_vector); + out_dims = common::make_ddim(dims_vector); } auto& place = *context.eigen_device(); Functor functor; diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index df36bee5f98ff3..ee319b060d0957 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/reduce_grad_functions.h b/paddle/phi/kernels/funcs/reduce_grad_functions.h index e06ae3986b9ded..c6c13b5fac64ba 100644 --- a/paddle/phi/kernels/funcs/reduce_grad_functions.h +++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h @@ -36,7 +36,7 @@ void ReduceGradFunctor(const Context& dev_ctx, auto x_grad = phi::EigenTensor::From(*output); auto x_rank = static_cast(x.dimensions().size()); auto x_dims = input0.dims(); - auto reduced_dims_v = phi::vectorize(x_dims); + auto reduced_dims_v = 
common::vectorize(x_dims); std::vector dims_ref = dims; Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; @@ -50,7 +50,7 @@ void ReduceGradFunctor(const Context& dev_ctx, broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; broad_cast_times *= x_dims[dims_ref[i]]; } - auto reduced_dims = phi::make_ddim(reduced_dims_v); + auto reduced_dims = common::make_ddim(reduced_dims_v); auto x_reduce = EigenTensor::From(input1, reduced_dims); auto x_reduce_grad = EigenTensor::From(input2, reduced_dims); diff --git a/paddle/phi/kernels/funcs/repeat_tensor2index_tensor.h b/paddle/phi/kernels/funcs/repeat_tensor2index_tensor.h index b66bf39b99e98c..9d2600183651f4 100644 --- a/paddle/phi/kernels/funcs/repeat_tensor2index_tensor.h +++ b/paddle/phi/kernels/funcs/repeat_tensor2index_tensor.h @@ -45,7 +45,7 @@ void RepeatsTensor2IndexTensor(const Context& ctx, std::fill_n(index_vec.begin() + offset, repeats_data[i], i); offset += repeats_data[i]; } - index->Resize(phi::make_ddim({index_size})); + index->Resize(common::make_ddim({index_size})); phi::TensorFromVector(index_vec, ctx, index); } diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h index 0f437db10b9332..8ef33b0ec49845 100644 --- a/paddle/phi/kernels/funcs/scatter.cu.h +++ b/paddle/phi/kernels/funcs/scatter.cu.h @@ -229,8 +229,8 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx, // final dim int64_t end_size = index_dims[index_dims_size - 1]; // remain dim - auto remain_ddim = phi::slice_ddim(index_dims, 0, index_dims_size - 1); - int64_t remain_numel = phi::product(remain_ddim); + auto remain_ddim = common::slice_ddim(index_dims, 0, index_dims_size - 1); + int64_t remain_numel = common::product(remain_ddim); // slice size int64_t slice_size = 1; for (int64_t i = end_size; i < output_dims_size; ++i) { diff --git a/paddle/phi/kernels/funcs/scatter.h b/paddle/phi/kernels/funcs/scatter.h index 5934f57b47ddec..ab4af24b70c94a 100644 --- a/paddle/phi/kernels/funcs/scatter.h +++ b/paddle/phi/kernels/funcs/scatter.h @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/ddim.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -276,8 +276,8 @@ void ScatterNdAdd(const phi::CPUContext& ctx, // final dim int64_t end_size = index_dims[index_dims_size - 1]; // remain dim - auto remain_ddim = phi::slice_ddim(index_dims, 0, index_dims_size - 1); - int64_t remain_numel = phi::product(remain_ddim); + auto remain_ddim = common::slice_ddim(index_dims, 0, index_dims_size - 1); + int64_t remain_numel = common::product(remain_ddim); // slice size int64_t slice_size = 1; for (int64_t i = end_size; i < output_dims_size; ++i) { diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc index ad8c942e10b28a..9af1211b9a144f 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cc +++ b/paddle/phi/kernels/funcs/segment_pooling.cc @@ -56,7 +56,7 @@ class SegmentPoolFunctor { Tensor in_t = input.Slice(last_idx, idx); int64_t h = idx - last_idx; - auto in_e = EigenMatrix::From(in_t, phi::make_ddim({h, w})); + auto in_e = EigenMatrix::From(in_t, common::make_ddim({h, w})); auto out_e = EigenVector::Flatten(out_t); auto reduce_dim = Eigen::array({{0}}); diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index 96b7942cf27094..1afcad9f0f918c 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -25,9 +25,9 @@ namespace cub = hipcub; #endif #include +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" @@ -444,10 +444,10 @@ void SelectKernel(const KPDevice &dev_ctx, std::vector out_dim = {static_cast(total_true_num)}; if (SelectData == 1) { - out->Resize(phi::make_ddim(out_dim)); + out->Resize(common::make_ddim(out_dim)); } else if (SelectData == 0) { // == 0 where_index out_dim.push_back(static_cast(rank)); - out->Resize(phi::make_ddim(out_dim)); + out->Resize(common::make_ddim(out_dim)); } auto out_data = dev_ctx.template Alloc(out); // 3.2 get true data's index according to cond_data and cumsum_data diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cc b/paddle/phi/kernels/funcs/selected_rows_functor.cc index fef0b1dbff25d6..1233490acba539 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cc +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/mixed_vector.h" #ifdef PADDLE_WITH_XPU @@ -561,7 +561,7 @@ struct MergeAddImpl { out.set_height(input_height); DenseTensor* out_tensor = out.mutable_value(); - out_tensor->Resize(phi::make_ddim( + out_tensor->Resize(common::make_ddim( {static_cast(merged_row_set.size()), input_width})); auto* out_data = context.template Alloc(out_tensor); @@ -677,8 +677,8 @@ struct MergeAdd { out.set_rows(merge_rows); out.set_height(input.height()); DenseTensor* out_tensor = out.mutable_value(); - out_tensor->Resize( - phi::make_ddim({static_cast(merge_rows.size()), input_width})); + out_tensor->Resize(common::make_ddim( + {static_cast(merge_rows.size()), input_width})); context.template Alloc(out_tensor); std::unordered_map rows_to_id; @@ -768,7 +768,7 @@ struct MergeAdd { out.set_height(input_height); DenseTensor* out_tensor = out.mutable_value(); - out_tensor->Resize(phi::make_ddim( + out_tensor->Resize(common::make_ddim( {static_cast(merged_row_set.size()), input_width})); context.template Alloc(out_tensor); @@ -877,7 +877,7 @@ struct MergeAverage { out.set_height(input_height); DenseTensor* out_tensor = out.mutable_value(); - out_tensor->Resize(phi::make_ddim( + out_tensor->Resize(common::make_ddim( {static_cast(merged_row_set.size()), input_width})); auto* out_data = context.template Alloc(out_tensor); diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cu b/paddle/phi/kernels/funcs/selected_rows_functor.cu index c74cda2e2bd443..b8617c2c9209ae 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cu +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cu @@ -399,8 +399,8 @@ struct MergeAddImpl { out.set_rows(merge_rows); out.set_height(input.height()); DenseTensor* out_tensor = out.mutable_value(); - out_tensor->Resize( - phi::make_ddim({static_cast(merge_rows.size()), input_width})); + out_tensor->Resize(common::make_ddim( + {static_cast(merge_rows.size()), input_width})); context.template Alloc(out_tensor); phi::funcs::SetConstant constant_functor; @@ -471,8 +471,8 @@ struct MergeAddImpl { out.set_height(input_height); DenseTensor* out_tensor = out.mutable_value(); - out_tensor->Resize( - phi::make_ddim({static_cast(merge_rows.size()), input_width})); + out_tensor->Resize(common::make_ddim( + {static_cast(merge_rows.size()), input_width})); context.template Alloc(out_tensor); phi::funcs::SetConstant constant_functor; diff --git a/paddle/phi/kernels/funcs/sequence2batch.cc b/paddle/phi/kernels/funcs/sequence2batch.cc index a00af7177cc882..3e30bca02d8a4f 100644 --- a/paddle/phi/kernels/funcs/sequence2batch.cc +++ b/paddle/phi/kernels/funcs/sequence2batch.cc @@ -26,8 +26,8 @@ class CopyMatrixRowsFunctor { phi::DenseTensor* dst, bool is_src_index) { size_t* index = index_lod.data(); - const auto& src_dims = vectorize(src.dims()); - const auto& dst_dims = vectorize(dst->dims()); + const auto& src_dims = common::vectorize(src.dims()); + const auto& dst_dims = common::vectorize(dst->dims()); PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, phi::errors::InvalidArgument( diff --git a/paddle/phi/kernels/funcs/sequence_pooling.cc b/paddle/phi/kernels/funcs/sequence_pooling.cc index 8d7d3b506136da..004bef522ab16a 100644 --- a/paddle/phi/kernels/funcs/sequence_pooling.cc +++ b/paddle/phi/kernels/funcs/sequence_pooling.cc @@ -415,7 +415,7 @@ class SequencePoolFunctor { phi::DenseTensor in_t = input.Slice(static_cast(lod[i]), static_cast(lod[i + 1])); int64_t h = static_cast(lod[i + 1] - 
lod[i]); - auto in_e = EigenMatrix::From(in_t, phi::make_ddim({h, w})); + auto in_e = EigenMatrix::From(in_t, common::make_ddim({h, w})); auto out_e = EigenVector::Flatten(out_t); if (pooltype == "AVERAGE") { out_e.device(place) = in_e.mean(Eigen::array({{0}})); diff --git a/paddle/phi/kernels/funcs/sequence_pooling.cu b/paddle/phi/kernels/funcs/sequence_pooling.cu index 4bc4b11692d5c4..551525a67bb7a5 100644 --- a/paddle/phi/kernels/funcs/sequence_pooling.cu +++ b/paddle/phi/kernels/funcs/sequence_pooling.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include +#include "paddle/common/macros.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/core/mixed_vector.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sequence_pooling.h" diff --git a/paddle/phi/kernels/funcs/slice.h b/paddle/phi/kernels/funcs/slice.h index 38b127541650be..ca438c5067849c 100644 --- a/paddle/phi/kernels/funcs/slice.h +++ b/paddle/phi/kernels/funcs/slice.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -74,7 +74,7 @@ DenseTensor Slice(const Context& dev_ctx, std::vector ends) { DenseTensor ret; std::vector new_axes = axes; - std::vector out_shape = phi::vectorize(x.dims()); + std::vector out_shape = common::vectorize(x.dims()); size_t rank = out_shape.size(); PADDLE_ENFORCE_EQ( axes.size(), @@ -105,7 +105,7 @@ DenseTensor Slice(const Context& dev_ctx, offset[new_axes[i]] = starts[i]; extends[new_axes[i]] = ends[i] - starts[i]; } - ret.Resize(phi::make_ddim(out_shape)); + ret.Resize(common::make_ddim(out_shape)); dev_ctx.template Alloc(&ret); switch (rank) { SLICE_RANK_CASE(1); @@ -140,14 +140,14 @@ static void Slice(const Context& ctx, extents[i] = in_dims[i]; } - std::vector out_shape_vec = vectorize(in_dims); + std::vector out_shape_vec = common::vectorize(in_dims); for (size_t i = 0; i < axes_vec.size(); ++i) { offsets[axes_vec[i]] = begin_vec[i]; extents[axes_vec[i]] = end_vec[i] - begin_vec[i]; out_shape_vec[axes_vec[i]] = end_vec[i] - begin_vec[i]; } - DDim out_dims(make_ddim(out_shape_vec)); + DDim out_dims(common::make_ddim(out_shape_vec)); out->Resize(out_dims); ctx.template Alloc(out); diff --git a/paddle/phi/kernels/funcs/slice_utils.h b/paddle/phi/kernels/funcs/slice_utils.h index 04e5c11aabeed3..a78dcf5419cd3f 100644 --- a/paddle/phi/kernels/funcs/slice_utils.h +++ b/paddle/phi/kernels/funcs/slice_utils.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once #include -#include +#include #include #include #include "paddle/phi/core/flags.h" @@ -210,7 +210,7 @@ inline DDim GetDecreasedDims(const DDim slice_dims, // slice. This will remove in release 2.6. 
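(For context: GetDecreasedDims in this hunk drops the slice axes marked for decrease and, for legacy semantics, keeps a rank-1 shape rather than a 0-D one. A simplified standalone sketch of that step, not the exact upstream signature:)

#include <cstdint>
#include <set>
#include <vector>
#include "paddle/common/ddim.h"

// Drop the axes listed in `decrease_axes` (each of extent 1) from
// `slice_dims`; fall back to shape {1} when every axis is dropped.
phi::DDim DropDecreasedAxes(const phi::DDim& slice_dims,
                            const std::set<int>& decrease_axes) {
  std::vector<int64_t> new_shape;
  for (int i = 0; i < slice_dims.size(); ++i) {
    if (decrease_axes.count(i) == 0) new_shape.push_back(slice_dims[i]);
  }
  if (new_shape.empty()) new_shape.push_back(1);  // legacy 1-D fallback
  return common::make_ddim(new_shape);
}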
new_shape.push_back(1); } - decreased_dims = phi::make_ddim(new_shape); + decreased_dims = common::make_ddim(new_shape); } return decreased_dims; } diff --git a/paddle/phi/kernels/funcs/softmax.cu b/paddle/phi/kernels/funcs/softmax.cu index 2ca97cd4ac2055..c7dfd0c0978c00 100644 --- a/paddle/phi/kernels/funcs/softmax.cu +++ b/paddle/phi/kernels/funcs/softmax.cu @@ -35,7 +35,7 @@ void SoftmaxCUDNNFunctor::operator()( // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor xDesc; ScopedTensorDescriptor yDesc; - std::vector cudnn_tensor_dims = phi::vectorize(X->dims()); + std::vector cudnn_tensor_dims = common::vectorize(X->dims()); DataLayout layout = DataLayout::kNCHW; if (cudnn_tensor_dims.size() == 5) { layout = DataLayout::kNCDHW; @@ -88,7 +88,7 @@ void SoftmaxGradCUDNNFunctor::operator()( ScopedTensorDescriptor yDesc; ScopedTensorDescriptor dyDesc; ScopedTensorDescriptor dxDesc; - std::vector cudnn_tensor_dims = phi::vectorize(Y->dims()); + std::vector cudnn_tensor_dims = common::vectorize(Y->dims()); DataLayout layout = DataLayout::kNCHW; if (cudnn_tensor_dims.size() == 5) { layout = DataLayout::kNCDHW; diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h index e4c836d1162523..0f207d099e8e71 100644 --- a/paddle/phi/kernels/funcs/sparse/common_shape.h +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace funcs { @@ -33,9 +33,9 @@ inline const DDim InferDenseDims(const DDim& x_dims, memcpy(&dense_dim_vec[1], x_dims.Get() + sparse_dim, dense_dim * sizeof(x_dims[0])); - values_dims = phi::make_ddim(dense_dim_vec); + values_dims = common::make_ddim(dense_dim_vec); } else { - values_dims = phi::make_ddim({non_zero_num}); + values_dims = common::make_ddim({non_zero_num}); } return values_dims; } diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index e6f3a573088b28..7048ca1a127f5c 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/funcs/sparse/flatten_indices.h b/paddle/phi/kernels/funcs/sparse/flatten_indices.h index 9a031b8cc12ca4..4edcd839572dbb 100644 --- a/paddle/phi/kernels/funcs/sparse/flatten_indices.h +++ b/paddle/phi/kernels/funcs/sparse/flatten_indices.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/sparse/softmax.cu.h b/paddle/phi/kernels/funcs/sparse/softmax.cu.h index 72f99bd6331c4b..b75f870970a314 100644 --- a/paddle/phi/kernels/funcs/sparse/softmax.cu.h +++ b/paddle/phi/kernels/funcs/sparse/softmax.cu.h @@ -40,7 +40,7 @@ inline DenseTensor GetOffsets(const Context& dev_ctx, } } - const IntArray strides_shape(phi::vectorize(indices.dims())); + const IntArray strides_shape(common::vectorize(indices.dims())); DenseTensor strides = phi::Empty(dev_ctx, strides_shape); auto strides_ptr = strides.data(); memory_utils::Copy(dev_ctx.GetPlace(), @@ -125,10 +125,10 @@ std::tuple ComputePoolMax( }); auto new_sz = thrust::distance(thrust_ptr(pool_sizes.data()), new_end.second); - pool_sizes.Resize(phi::make_ddim({new_sz})); + pool_sizes.Resize(common::make_ddim({new_sz})); DenseTensor pool_offsets; - pool_offsets.Resize(phi::make_ddim({new_sz})); + pool_offsets.Resize(common::make_ddim({new_sz})); dev_ctx.template Alloc(&pool_offsets); phi::Copy(dev_ctx, pool_sizes, dev_ctx.GetPlace(), false, &pool_offsets); diff --git a/paddle/phi/kernels/funcs/sparse/softmax.h b/paddle/phi/kernels/funcs/sparse/softmax.h index fcb45def6c1fae..2a820461c4181b 100644 --- a/paddle/phi/kernels/funcs/sparse/softmax.h +++ b/paddle/phi/kernels/funcs/sparse/softmax.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/tensor_utils.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index fde5cb1768d47c..3502dbfc9ceda4 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -16,11 +16,11 @@ #include "glog/logging.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/dynload/cusparse.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/sparse_coo_tensor.h" @@ -65,7 +65,7 @@ template inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, const phi::GPUContext& dev_ctx, cusparseSpMatDescr_t* descriptor) { - std::vector xdim_vec = phi::vectorize(x.dims()); + std::vector xdim_vec = common::vectorize(x.dims()); auto x_ndims = xdim_vec.size(); PADDLE_ENFORCE_GE( x_ndims, @@ -120,7 +120,7 @@ template inline void CreateCooDescriptor(const phi::SparseCooTensor& x, const phi::GPUContext& dev_ctx, cusparseSpMatDescr_t* descriptor) { - std::vector xdim_vec = phi::vectorize(x.dims()); + std::vector xdim_vec = common::vectorize(x.dims()); auto x_ndims = xdim_vec.size(); PADDLE_ENFORCE_GE( x_ndims, @@ -214,7 +214,7 @@ class CuSparseDnMatDescriptor { explicit CuSparseDnMatDescriptor(const phi::DenseTensor& x, const phi::GPUContext& dev_ctx) : dev_ctx_(dev_ctx) { - std::vector xdim_vec = phi::vectorize(x.dims()); + std::vector xdim_vec = common::vectorize(x.dims()); auto x_ndims = xdim_vec.size(); PADDLE_ENFORCE_GE( x_ndims, @@ -278,7 +278,7 @@ class CuSparseDnVecDescriptor { explicit CuSparseDnVecDescriptor(const phi::DenseTensor& x, const phi::GPUContext& dev_ctx) : dev_ctx_(dev_ctx) { - std::vector xdim_vec = phi::vectorize(x.dims()); + std::vector xdim_vec = 
common::vectorize(x.dims()); auto x_ndims = xdim_vec.size(); PADDLE_ENFORCE_GE(x_ndims, 1, diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h index cbd42be3cb6d49..6b6c8c58385cd8 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h @@ -14,11 +14,11 @@ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/backends/dynload/rocsparse.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/sparse_coo_tensor.h" @@ -64,7 +64,7 @@ template inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, const phi::GPUContext& dev_ctx, rocsparse_spmat_descr* descriptor) { - std::vector xdim_vec = phi::vectorize(x.dims()); + std::vector xdim_vec = common::vectorize(x.dims()); auto x_ndims = xdim_vec.size(); PADDLE_ENFORCE_GE( x_ndims, @@ -115,7 +115,7 @@ template inline void CreateCooDescriptor(const phi::SparseCooTensor& x, const phi::GPUContext& dev_ctx, rocsparse_spmat_descr* descriptor) { - std::vector xdim_vec = phi::vectorize(x.dims()); + std::vector xdim_vec = common::vectorize(x.dims()); auto x_ndims = xdim_vec.size(); PADDLE_ENFORCE_GE( x_ndims, @@ -203,7 +203,7 @@ class RocSparseDnMatDescriptor { explicit RocSparseDnMatDescriptor(const phi::DenseTensor& x, const phi::GPUContext& dev_ctx) : dev_ctx_(dev_ctx) { - std::vector xdim_vec = phi::vectorize(x.dims()); + std::vector xdim_vec = common::vectorize(x.dims()); auto x_ndims = xdim_vec.size(); PADDLE_ENFORCE_GE( x_ndims, diff --git a/paddle/phi/kernels/funcs/strided_memcpy.h b/paddle/phi/kernels/funcs/strided_memcpy.h index de38e40d317e19..b91ab85c55b33c 100644 --- a/paddle/phi/kernels/funcs/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/strided_memcpy.h @@ -12,8 +12,8 @@ limitations under the License. 
*/ #pragma once #include +#include "paddle/common/macros.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/kernels/funcs/detail/strided_memcpy.h" namespace phi { @@ -146,12 +146,12 @@ inline void StridedMemcpyWithAxis0( const phi::DenseTensor& input, const std::vector& shape_refer, std::vector* outputs) { - const phi::DDim in_stride = stride_numel(input.dims()); + const phi::DDim in_stride = common::stride_numel(input.dims()); const int axis = 0; size_t input_offset = 0; for (size_t i = 0; i < outputs->size(); ++i) { - auto out_stride = stride_numel(shape_refer[i]->dims()); + auto out_stride = common::stride_numel(shape_refer[i]->dims()); auto out = outputs->at(i); if (out != nullptr && out->initialized() && out->numel() > 0) { StridedNumelCopyWithAxis(dev_ctx, diff --git a/paddle/phi/kernels/funcs/strided_slice.h b/paddle/phi/kernels/funcs/strided_slice.h index 4a88c1e0660b79..06503f80342d76 100644 --- a/paddle/phi/kernels/funcs/strided_slice.h +++ b/paddle/phi/kernels/funcs/strided_slice.h @@ -17,7 +17,7 @@ #include #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_array.h" @@ -212,7 +212,7 @@ void StridedSliceCompute(const Context& dev_ctx, out_dims_vector.data(), axes.size(), false); - DDim out_dims(phi::make_ddim(out_dims_vector)); + DDim out_dims(common::make_ddim(out_dims_vector)); std::vector reverse_vector(starts_.size(), 0); StridedSliceFunctor(starts_.data(), @@ -260,7 +260,7 @@ void StridedSliceCompute(const Context& dev_ctx, if (new_out_shape.size() == 0) { new_out_shape.push_back(1); } - out_dims_origin = phi::make_ddim(new_out_shape); + out_dims_origin = common::make_ddim(new_out_shape); } bool need_reverse = false; @@ -307,7 +307,7 @@ void StridedSliceCompute(const Context& dev_ctx, const std::vector& decrease_axis, TensorArray* out) { const int64_t size = x.size(); - auto in_dims = phi::make_ddim({size}); + auto in_dims = common::make_ddim({size}); auto starts_ = starts.GetData(); auto ends_ = ends.GetData(); @@ -329,7 +329,7 @@ void StridedSliceCompute(const Context& dev_ctx, out_dims_vector.data(), axes.size(), false); - DDim out_dims(phi::make_ddim(out_dims_vector)); + DDim out_dims(common::make_ddim(out_dims_vector)); std::vector reverse_vector(starts_.size(), 0); StridedSliceFunctor(starts_.data(), @@ -377,7 +377,7 @@ void StridedSliceCompute(const Context& dev_ctx, if (new_out_shape.size() == 0) { new_out_shape.push_back(1); } - out_dims_origin = phi::make_ddim(new_out_shape); + out_dims_origin = common::make_ddim(new_out_shape); } bool need_reverse = false; @@ -547,7 +547,7 @@ void StridedSliceGradCompute(const Context& dev_ctx, // calculate the output shape. when set it to inplace OP, there may be // some problems. 
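// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): common::stride_numel(), adopted
// by the strided_memcpy.h hunk above, builds a DDim of "numel strides" --
// entry i is the number of elements spanned by dimensions i..rank-1.
// StrideNumelDemo is an illustrative name; the values are worked by hand
// under that assumption.
#include "paddle/common/ddim.h"

void StrideNumelDemo() {
  phi::DDim dims = common::make_ddim({2, 3, 4});
  phi::DDim strides = common::stride_numel(dims);
  // strides == {24, 12, 4}: 24 elements from dim 0 onward, 12 from dim 1,
  // and 4 in the innermost dim; StridedNumelCopyWithAxis walks these.
}
// ---------------------------------------------------------------------------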
const int64_t size = x.size(); - DDim out_dims = phi::make_ddim({size}); + DDim out_dims = common::make_ddim({size}); auto starts_ = starts.GetData(); auto ends_ = ends.GetData(); diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index b6d6b0cffc667f..31502804f7f4e1 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -1043,7 +1043,7 @@ bool SortTopk(const phi::GPUContext& ctx, Tensor input_indices; const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); + auto dim = common::make_ddim(dims); input_indices.Resize(dim); ctx.template Alloc(&input_indices); size_t temp_storage_bytes = -1; @@ -1255,7 +1255,7 @@ bool SortTopk(const phi::GPUContext& ctx, static_cast(temp_indices)); std::vector odims = {static_cast(num_rows), static_cast(k)}; - auto dim = phi::make_ddim(odims); + auto dim = common::make_ddim(odims); auto e_values = phi::EigenMatrix::From(*out_tensor, dim); auto e_tmp_values = phi::EigenMatrix::From(static_cast(temp_values)); diff --git a/paddle/phi/kernels/funcs/transpose_function.cu.h b/paddle/phi/kernels/funcs/transpose_function.cu.h index 5bc42a8b69f0e7..173bef120fb606 100644 --- a/paddle/phi/kernels/funcs/transpose_function.cu.h +++ b/paddle/phi/kernels/funcs/transpose_function.cu.h @@ -1454,8 +1454,8 @@ inline void PermuteWithEigen( phi::DenseTensor temp_in; temp_in.ShareBufferWith(in); - temp_in.Resize(phi::make_ddim(simplifier.GetSrcDims())); - out->Resize(phi::make_ddim(simplifier.GetDstDims())); + temp_in.Resize(common::make_ddim(simplifier.GetSrcDims())); + out->Resize(common::make_ddim(simplifier.GetDstDims())); TransCompute( simplifier.GetRank(), ctx, temp_in, out, simplifier.GetPerm()); @@ -1476,7 +1476,7 @@ void TransposeGPUKernelDriver(const phi::GPUContext& ctx, bool ret = TransposeSimple::Run(ctx, in, perm, out, numel); if (!ret) { auto simplifier = phi::funcs::PermuteDimsSimplifier( - rank, numel, perm, phi::vectorize(in.dims())); + rank, numel, perm, common::vectorize(in.dims())); auto* tuner = phi::autotune::MakeTransposeTuner(PermuteWithEigen); tuner->AddCallBack(PermuteAndTranspose); diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h index 806d7cca84851d..ade7cf2d8a0daf 100644 --- a/paddle/phi/kernels/funcs/unique_functor.h +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -71,7 +71,7 @@ struct UniqueOpFunctor { if (count_ != nullptr) { // Resize the count tensor dims to allocate the memory - count_->Resize(phi::make_ddim({static_cast(uniq.size())})); + count_->Resize(common::make_ddim({static_cast(uniq.size())})); IndexT* count_data = context_.template Alloc(count_); // init count_data to 0 memset(count_data, 0, uniq.size() * sizeof(IndexT)); @@ -101,7 +101,7 @@ struct UniqueOpFunctor { } } - out_->Resize(phi::make_ddim({static_cast(uniq.size())})); + out_->Resize(common::make_ddim({static_cast(uniq.size())})); auto* out_data = context_.template Alloc(out_); std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT)); } @@ -141,12 +141,12 @@ static void UniqueFlattendTensor(const Context& context, bool return_counts) { const InT* in_data = in.data(); std::set unique(in_data, in_data + in.numel()); - out->Resize(phi::make_ddim({static_cast(unique.size())})); + out->Resize(common::make_ddim({static_cast(unique.size())})); auto* out_data = context.template Alloc(out); std::copy(unique.begin(), unique.end(), out_data); if (return_index) { - 
indices->Resize(phi::make_ddim({out->numel()})); + indices->Resize(common::make_ddim({out->numel()})); auto indices_data = context.template Alloc(indices); std::unordered_map indices_map; indices_map.reserve(out->numel()); @@ -160,7 +160,7 @@ static void UniqueFlattendTensor(const Context& context, } if (return_inverse) { - index->Resize(phi::make_ddim({in.numel()})); + index->Resize(common::make_ddim({in.numel()})); auto inverse_data = context.template Alloc(index); std::unordered_map inverse_map; inverse_map.reserve(out->numel()); @@ -173,7 +173,7 @@ static void UniqueFlattendTensor(const Context& context, } if (return_counts) { - count->Resize(phi::make_ddim({out->numel()})); + count->Resize(common::make_ddim({out->numel()})); auto count_data = context.template Alloc(count); std::unordered_map counts_map; counts_map.reserve(out->numel()); @@ -240,16 +240,16 @@ static void UniqueDim(const Context& context, std::iota(permute.begin(), permute.end(), 0); permute[axis] = 0; permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + std::vector in_trans_dims_vec(common::vectorize(in.dims())); in_trans_dims_vec[axis] = in.dims()[0]; in_trans_dims_vec[0] = in.dims()[axis]; DenseTensor in_trans; - phi::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + phi::DDim in_trans_dims = common::make_ddim(in_trans_dims_vec); in_trans.Resize(in_trans_dims); context.template Alloc(&in_trans); TransCompute(in.dims().size(), context, in, &in_trans, permute); // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - phi::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + phi::DDim in_trans_flat_dims = common::flatten_to_2d(in_trans_dims, 1); in_trans.Resize(in_trans_flat_dims); // sort indices @@ -304,10 +304,10 @@ static void UniqueDim(const Context& context, DenseTensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = input_unbind.size(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + out_trans.Resize(common::make_ddim(out_trans_dims_vec)); context.template Alloc(&out_trans); std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); + out->Resize(common::make_ddim(out_trans_dims_vec)); context.template Alloc(out); concat_functor(context, input_unbind, 0, &out_trans); TransCompute( diff --git a/paddle/phi/kernels/funcs/unsqueeze.h b/paddle/phi/kernels/funcs/unsqueeze.h index b15e781b25117b..a8fc8dc8495449 100644 --- a/paddle/phi/kernels/funcs/unsqueeze.h +++ b/paddle/phi/kernels/funcs/unsqueeze.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/dense_tensor.h" // TODO(paddle-dev): Remove this file when we can call related Kernel directly @@ -100,7 +100,7 @@ inline DDim GetOutputSqueezeShape(const std::vector squeeze_dims, output_shape.push_back(in_dims[i]); } } - return phi::make_ddim(output_shape); + return common::make_ddim(output_shape); } inline DDim GetUnsqueezeShape(const std::vector unsqz_dims, @@ -149,13 +149,13 @@ inline DDim GetUnsqueezeShape(const std::vector unsqz_dims, } } - return phi::make_ddim(output_shape); + return common::make_ddim(output_shape); } inline const DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) { // don't copy data, only change the dims DenseTensor out(x); - std::vector out_shape = phi::vectorize(x.dims()); + std::vector out_shape = common::vectorize(x.dims()); if (axis >= 0) { auto index = (out_shape.begin() + axis); out_shape.insert(index, 
1); @@ -163,7 +163,7 @@ inline const DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) { auto index = (out_shape.end() + axis + 1); out_shape.insert(index, 1); } - out.Resize(phi::make_ddim(out_shape)); + out.Resize(common::make_ddim(out_shape)); return out; } diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index 512155b94bfb39..0de31efaa19b7d 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -14,8 +14,8 @@ #pragma once #ifdef PADDLE_WITH_CUDA +#include "paddle/common/errors.h" #include "paddle/phi/backends/dynload/cusolver.h" -#include "paddle/phi/core/errors.h" #endif // PADDLE_WITH_CUDA #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -281,16 +281,16 @@ struct MatrixEighFunctor { input.type() == phi::DataType::COMPLEX128) { lrwork = std::max(1, static_cast(rwork_opt)); - rwork_tensor.Resize(phi::make_ddim({lrwork})); + rwork_tensor.Resize(common::make_ddim({lrwork})); rwork_data = dev_ctx.template Alloc(&rwork_tensor); } DenseTensor iwork_tensor, work_tensor; - iwork_tensor.Resize(phi::make_ddim({liwork})); + iwork_tensor.Resize(common::make_ddim({liwork})); int *iwork_data = dev_ctx.template Alloc(&iwork_tensor); - work_tensor.Resize(phi::make_ddim({lwork})); + work_tensor.Resize(common::make_ddim({lwork})); T *work_data = dev_ctx.template Alloc(&work_tensor); for (auto i = 0; i < batch_size; i++) { diff --git a/paddle/phi/kernels/funcs/vol2col.cc b/paddle/phi/kernels/funcs/vol2col.cc index b5d6086feda770..b7c6a1fd6c1e83 100644 --- a/paddle/phi/kernels/funcs/vol2col.cc +++ b/paddle/phi/kernels/funcs/vol2col.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/vol2col.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/enforce.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/vol2col.h b/paddle/phi/kernels/funcs/vol2col.h index 283ab3ea065635..bd909927952d04 100644 --- a/paddle/phi/kernels/funcs/vol2col.h +++ b/paddle/phi/kernels/funcs/vol2col.h @@ -16,8 +16,8 @@ limitations under the License. */ #include +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc b/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc index 3cb37ccf2ed89d..bbcb61bd454765 100644 --- a/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/distributed_fused_lamb_init_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_upper_triangle_kernel.cc b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_upper_triangle_kernel.cc index b9ded16d1b0958..6257e9c451aaa7 100644 --- a/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_upper_triangle_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_upper_triangle_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. 
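// ---------------------------------------------------------------------------
// Editorial sketch (not part of the patch): common::flatten_to_2d(), used by
// the unique_functor.h hunk above and the fusion_gru hunks below, collapses a
// DDim into a 2-D matrix shape around a split axis. FlattenDemo is an
// illustrative name; the values are worked by hand under that assumption.
#include "paddle/common/ddim.h"

void FlattenDemo() {
  phi::DDim dims = common::make_ddim({2, 3, 4, 5});
  phi::DDim mat = common::flatten_to_2d(dims, /*num_col_dims=*/2);
  // mat == {6, 20}: dims before the split axis multiply into the row count
  // (2*3), the remaining dims into the column count (4*5).
}
// ---------------------------------------------------------------------------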
#include "paddle/phi/kernels/fused_softmax_mask_upper_triangle_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc index 3b140091fc69c4..deac38bf8dbab9 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc @@ -16,9 +16,9 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -33,7 +33,7 @@ namespace fusion { auto x_lod = x.lod(); \ auto x_dims = x.dims(); /* T x M*/ \ auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) \ - ? phi::flatten_to_2d(x_dims, 1) \ + ? common::flatten_to_2d(x_dims, 1) \ : x_dims; \ auto wh_dims = weight_h.dims(); /* D x 3D*/ \ const int total_T = x_mat_dims[0]; \ diff --git a/paddle/phi/kernels/fusion/cpu/fusion_repeated_fc_relu_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_repeated_fc_relu_kernel.cc index b65cf71bf93859..b52871620e30a2 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_repeated_fc_relu_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_repeated_fc_relu_kernel.cc @@ -51,7 +51,7 @@ void FusionRepeatedFCReluKernel(const Context& dev_ctx, DenseTensor* out) { int weight_sz = static_cast(w.size()); - auto i_dims = phi::vectorize(x.dims()); + auto i_dims = common::vectorize(x.dims()); const auto& w_dims = w[0]->dims(); phi::jit::matmul_attr_t attr; attr.m = i_dims[0]; diff --git a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc index fbe2ea8d12bc27..4ff18849316d8a 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc @@ -15,9 +15,9 @@ #include // for min, max #include +#include "paddle/common/errors.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -37,8 +37,8 @@ void FusionSeqConvEltAddReluKernel(const Context& dev_ctx, DenseTensor* out, DenseTensor* col_mat) { auto x_lod = x.lod(); - auto x_dims = phi::vectorize(x.dims()); - auto w_dims = phi::vectorize(filter.dims()); + auto x_dims = common::vectorize(x.dims()); + auto w_dims = common::vectorize(filter.dims()); PADDLE_ENFORCE_EQ( bias.numel(), w_dims[1], diff --git a/paddle/phi/kernels/fusion/cpu/fusion_seqexpand_concat_fc_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_seqexpand_concat_fc_kernel.cc index d5eb7894455f1d..d96940a8c1c2f6 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_seqexpand_concat_fc_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_seqexpand_concat_fc_kernel.cc @@ -14,10 +14,10 @@ #include +#include "paddle/common/errors.h" #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git 
a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu index cc4fd467dfc20b..f9c3cb0e7c7610 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu @@ -14,8 +14,8 @@ #include "glog/logging.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen/memory_efficient_attention.h" #include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/gemm_kernel_utils.h" diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py index 07e710e52d206a..ac68a611539d20 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_variable_forward_kernels.py @@ -214,7 +214,7 @@ def parse_args(): cutlass::Status status; size_t workspace_size = fmha.get_workspace_size(args); phi::DenseTensor workspace; - workspace.Resize(phi::make_ddim({{static_cast(workspace_size)}})); + workspace.Resize(common::make_ddim({{static_cast(workspace_size)}})); ctx.template Alloc(&workspace); status = fmha.initialize(args, workspace.data()); if (status != cutlass::Status::kSuccess) {{ diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h index 65dfb1bc8eced4..43afbdb55707dd 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h b/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h index 5ae8aed256ccdd..a4fdcb10e1b189 100644 --- a/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h +++ b/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" @@ -37,9 +37,9 @@ static void VecCastKernel(const phi::GPUContext &ctx, auto main_offset = n / (VecSize * thread) * VecSize * thread; auto stream = ctx.stream(); using FunctorT = CastFunctor; - phi::Array in_arr; + Array in_arr; in_arr[0] = reinterpret_cast(x); - phi::Array<_ptr_ OutT *, 1> out_arr; + Array<_ptr_ OutT *, 1> out_arr; out_arr[0] = y; phi::funcs::VectorizedElementwiseKernel <<>>( diff --git a/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu b/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu index 3fd94b8e3b46ee..e8127fbdae3993 100644 --- a/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu @@ -23,12 +23,12 @@ #include "glog/logging.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include 
"paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/impl/conv_cudnn_impl.h" #include "paddle/utils/optional.h" @@ -221,7 +221,7 @@ class CudnnConvDescManager { phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - make_ddim(in_data_dims), + common::make_ddim(in_data_dims), strides, ksize); @@ -400,8 +400,8 @@ void Conv2dFusionKernel(const Context& ctx, paddings_t, dilations_t, padding_algorithm, - phi::vectorize(input.dims()), - phi::vectorize(filter.dims()), + common::vectorize(input.dims()), + common::vectorize(filter.dims()), strides, compute_format); @@ -409,7 +409,7 @@ void Conv2dFusionKernel(const Context& ctx, const int input_rank = input.dims().size(); auto unsys_pad_process = [&](const std::vector& new_input_shape_vec, const std::vector& input_pad) { - DDim new_input_shape(make_ddim(new_input_shape_vec)); + DDim new_input_shape(common::make_ddim(new_input_shape_vec)); transformed_input.Resize(new_input_shape); ctx.template Alloc(&transformed_input); @@ -528,10 +528,10 @@ void Conv2dFusionKernel(const Context& ctx, }; auto cudnn_cache_info = CudnnConvDescManager::Instance()->GetCudnnCacheInfo( - phi::vectorize(transformed_input.dims()), - phi::vectorize(filter.dims()), + common::vectorize(transformed_input.dims()), + common::vectorize(filter.dims()), b_dims, - phi::vectorize(output->dims()), + common::vectorize(output->dims()), conv_attr_cache->paddings, strides, conv_attr_cache->dilations, diff --git a/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu b/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu index 3ae7f0682bc75b..7182a13bcf0fcd 100644 --- a/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu @@ -275,7 +275,7 @@ static void ShareBufferForNonInitedTensor(DenseTensor *origin, DDim fused_out_dim = fused_out->dims(); auto fused_out_numel = fused_out->numel(); - auto numel = phi::product(dims); + auto numel = common::product(dims); *origin = fused_out->Resize({fused_out_numel}) .Slice(numel_offset, numel + numel_offset); origin->Resize(dims); diff --git a/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu index 71e778ca6574e4..1833788a6b8c97 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu @@ -15,11 +15,11 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/emb_eltwise_layer_norm_functor.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu index f7f8faa329d60f..2d3b2938a09a07 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu @@ -27,11 +27,11 @@ namespace cub = hipcub; #include #endif +#include "paddle/common/errors.h" #include 
"paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -420,7 +420,7 @@ void FusedFCElementwiseLayerNormKernel( auto w_dims = w.dims(); int N = w_dims[1]; int K = w_dims[0]; - int M = phi::product(x.dims()) / K; + int M = common::product(x.dims()) / K; const T* x_data = x.data(); const T* w_data = w.data(); diff --git a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_add_relu_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_add_relu_kernel.cu index ff5edd689f7f3b..6bb7950a823aba 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_add_relu_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_add_relu_kernel.cu @@ -77,8 +77,8 @@ void FusedScaleBiasAddReluKernel(const Context& dev_ctx, auto tensor_format_math = CUDNN_DATA_FLOAT; auto compute_dtype = CUDNN_DATA_FLOAT; - auto dim_x = - phi::backends::gpu::TransformDimOrder(phi::vectorize(x1.dims())); + auto dim_x = phi::backends::gpu::TransformDimOrder( + common::vectorize(x1.dims())); std::vector dim_c(dim_x.size(), 1); dim_c[1] = dim_x[1]; // [1, C, 1, 1] diff --git a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu index c0d35cbf718abc..04ed9ebaf146bf 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu @@ -85,7 +85,7 @@ void FusedScaleBiasReluConvBnstatsImpl( auto filter_dims = w_transformed.dims(); DDim in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); DDim filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1); - std::vector ksize = phi::vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); phi::UpdatePaddingAndDilation(&paddings_vec, &dilations_vec, padding_algorithm, @@ -122,12 +122,12 @@ void FusedScaleBiasReluConvBnstatsImpl( auto compute_dtype = CUDNN_DATA_FLOAT; // get dims in CUDNN manner: [N, C, H, W] - auto dim_x = - phi::backends::gpu::TransformDimOrder(phi::vectorize(in_dims)); + auto dim_x = phi::backends::gpu::TransformDimOrder( + common::vectorize(in_dims)); auto dim_filt = phi::backends::gpu::TransformDimOrder( - phi::vectorize(filter_dims)); + common::vectorize(filter_dims)); auto dim_y = phi::backends::gpu::TransformDimOrder( - phi::vectorize(output->dims())); + common::vectorize(output->dims())); std::vector dim_scale(dim_x.size(), 1); dim_scale[1] = dim_x[1]; // [1, C, 1, 1] std::vector dim_sum(dim_x.size(), 1); // [1, K, 1, 1] @@ -323,7 +323,7 @@ void BNFinalizeImpl(const Context& dev_ctx, auto tensor_format = phi::backends::gpu::ToCudnnDataType(eq_scale->dtype()); auto compute_dtype = CUDNN_DATA_FLOAT; // create tensor descriptors - auto dim_input = phi::vectorize(sum_tensor.dims()); + auto dim_input = common::vectorize(sum_tensor.dims()); std::vector dim_c = {1, dim_input[0], 1, 1}; // [1, C, 1, 1] std::vector dim_scalar = {1, 1, 1, 1}; std::vector stride_scalar = {1, 1, 1, 1}; @@ -555,7 +555,7 @@ void FusedScaleBiasReluConvBnKernel(const Context& dev_ctx, if (accumulation_count == 0) { // dim_out = [N, H, W, C] // accumulation_count = N * H * W - auto dim_out = phi::vectorize(out->dims()); + auto dim_out = common::vectorize(out->dims()); 
accumulation_count = dim_out[0] * dim_out[1] * dim_out[2]; } diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu index 6c7fe36d364576..a08af5a5b89581 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu index 30e5599aac2363..a0b7cf5b2689ce 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu b/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu index b71f814fd4c985..422b1eade55769 100644 --- a/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu @@ -15,10 +15,10 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu b/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu index 06f28d387b3b33..c970f50eb117ad 100644 --- a/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu @@ -15,9 +15,9 @@ #include #include +#include "paddle/common/errors.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -349,9 +349,9 @@ void MultiheadMatmulKernel(const Context &dev_ctx, phi::DenseTensor temp_out_tensor; auto temp_out_dims = - phi::make_ddim({batch, seq_len, 3, head_number, head_size}); + common::make_ddim({batch, seq_len, 3, head_number, head_size}); temp_out_tensor.Resize( - {batch * seq_len, phi::product(temp_out_dims) / (batch * seq_len)}); + {batch * seq_len, common::product(temp_out_dims) / (batch * seq_len)}); auto *temp_out_data = dev_ctx.template Alloc( &temp_out_tensor, temp_out_tensor.numel() * sizeof(T)); diff --git a/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu index c180311755cd97..1bb5c5dfb6301c 100644 --- 
a/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/skip_layernorm_functor.h" diff --git a/paddle/phi/kernels/fusion/onednn/fc_kernel.cc b/paddle/phi/kernels/fusion/onednn/fc_kernel.cc index 368a1a616eab84..6eed95b9b1c9a4 100644 --- a/paddle/phi/kernels/fusion/onednn/fc_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fc_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include +#include "paddle/common/errors.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/expect.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" @@ -84,8 +84,8 @@ class FCOneDNNHandler dev_ctx_(dev_ctx) { this->memory_key_ = dev_ctx.GetInputsName("W")[0]; - auto x_vec_dims = phi::vectorize(x->dims()); - auto weights_vec_dims = phi::vectorize(weights->dims()); + auto x_vec_dims = common::vectorize(x->dims()); + auto weights_vec_dims = common::vectorize(weights->dims()); int MB = 1; for (int i = 0; i < in_num_col_dims; ++i) { @@ -382,7 +382,7 @@ void RecomputeOutputDims(const int in_num_col_dims, output_dims, in_num_col_dims, padding_weights); - out->Resize(phi::make_ddim(output_dims)); + out->Resize(common::make_ddim(output_dims)); out->set_lod(x->lod()); } @@ -436,8 +436,8 @@ void RunKernel(const phi::OneDNNContext& dev_ctx, phi::funcs::CreateKey(dev_ctx, dev_ctx.GetInputsName("Input")[0], dev_ctx.GetInputsName("W")[0], - phi::vectorize(input.dims()), - phi::vectorize(w.dims()))); + common::vectorize(input.dims()), + common::vectorize(w.dims()))); auto inner_product_cache = std::static_pointer_cast(dev_ctx.GetBlob(cache_key)); @@ -547,7 +547,7 @@ void RunKernel(const phi::OneDNNContext& dev_ctx, } const auto out_md = - dst_memory_p->get_desc().reshape(phi::vectorize(out->dims())); + dst_memory_p->get_desc().reshape(common::vectorize(out->dims())); if (dev_ctx.HasDnnAttr("fused_reshape2_shape")) { phi::funcs::SetOutMemDescWithReshape2FuseSupport( diff --git a/paddle/phi/kernels/fusion/onednn/fused_conv_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_conv_kernel.cc index 6cbf2c2c05f7dc..9c19c9a202c161 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_conv_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_conv_kernel.cc @@ -145,7 +145,7 @@ KernelKey ConvGetKernelTypeForVar(const GetKernelTypeForVarContext* ctx) { (tensor.layout() != phi::DataLayout::ONEDNN)) { auto it = attrs.find("data_format"); const std::string data_format = PADDLE_GET_CONST(std::string, it->second); - auto dl = phi::StringToDataLayout(data_format); + auto dl = common::StringToDataLayout(data_format); // Some models may have intentionally set "AnyLayout" for conv // op. 
Treat this as NCHW (default data_format value) if (dl != phi::DataLayout::kAnyLayout) { diff --git a/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc index 86ef5b368476ad..1f2c0766f95e40 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_matmul_kernel.cc @@ -369,10 +369,11 @@ void ExecuteFusedMatmul(const OneDNNContext &dev_ctx, if (is_output_fused && !funcs::is_int8()) { auto permuted_md = dst_memory_p->get_desc().permute_axes(fused_transpose_Out); - out->set_mem_desc(permuted_md.reshape(vectorize(out->dims()))); - } else { out->set_mem_desc( - dst_memory_p->get_desc().reshape(vectorize(out->dims()))); + permuted_md.reshape(common::vectorize(out->dims()))); + } else { + out->set_mem_desc(dst_memory_p->get_desc().reshape( + common::vectorize(out->dims()))); } } @@ -380,9 +381,9 @@ std::vector GetInputShape(DDim input_dims, std::vector shape, std::vector axis) { if (!shape.empty() && !axis.empty()) { - return vectorize(input_dims.reshape(shape).transpose(axis)); + return common::vectorize(input_dims.reshape(shape).transpose(axis)); } - return vectorize(input_dims); + return common::vectorize(input_dims); } void CalculateMatrixDims(const std::vector &x_dims, @@ -413,7 +414,7 @@ void CalculateMatrixDims(const std::vector &x_dims, } if (!is_output_fused && x_dims.size() > 2 && y_dims.size() > 2) { - auto out_dims = vectorize(out->dims()); + auto out_dims = common::vectorize(out->dims()); for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) { PADDLE_ENFORCE_EQ( (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 || @@ -429,7 +430,7 @@ void CalculateMatrixDims(const std::vector &x_dims, (*y_bd_dims)[i])); (out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); } - out->Resize(make_ddim((out_dims))); + out->Resize(common::make_ddim((out_dims))); } } diff --git a/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc index 964263424f0973..a7f9e49e325603 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc @@ -48,7 +48,7 @@ void SetInMemDescWithSqueeze2FuseSupport( } in->set_mem_desc(in_md.reshape(squeezed_op_tz)); - in->Resize(make_ddim(squeezed_op_tz)); + in->Resize(common::make_ddim(squeezed_op_tz)); } template @@ -76,7 +76,7 @@ void FusedTransposeKernel(const Context& dev_ctx, formated_axis[i] = axis[i] + axis_size; } } - auto dims = phi::vectorize(x_dims); + auto dims = common::vectorize(x_dims); std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); x_dims = x_dims.reshape(dims); @@ -107,7 +107,7 @@ void FusedTransposeKernel(const Context& dev_ctx, return; } - auto x_vec_dims = vectorize(x.dims()); + auto x_vec_dims = common::vectorize(x.dims()); auto x_type = funcs::ToOneDNNDataType(x.dtype()); dnnl::primitive_attr attrs; @@ -188,7 +188,7 @@ void FusedTransposeKernel(const Context& dev_ctx, fused_reshape2_shape, out, out_md); } else if (!fused_squeeze2_axes.empty()) { out->set_mem_desc(out_md); - out->Resize(make_ddim(out_md.get_dims())); + out->Resize(common::make_ddim(out_md.get_dims())); } else { out->set_mem_desc(out_md); } diff --git a/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc b/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc index e3fa939aad7537..8e7fe89ec1f7f8 100644 --- a/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc +++ 
b/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/common/errors.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/expect.h" #include "paddle/phi/core/utils/data_type.h" @@ -451,12 +451,12 @@ void RunKernel(const phi::OneDNNContext& dev_ctx, auto x_dims = x.dims(); auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) - ? phi::flatten_to_2d(x_dims, 1) + ? common::flatten_to_2d(x_dims, 1) : x_dims; // Get tensor dimensions - const auto x_mat_dims_vec = phi::vectorize(x_mat_dims); - const auto weight_h_dims = phi::vectorize(weight_h.dims()); + const auto x_mat_dims_vec = common::vectorize(x_mat_dims); + const auto weight_h_dims = common::vectorize(weight_h.dims()); const auto& input_lod = x.lod()[0]; // Calculate RNN dimensions diff --git a/paddle/phi/kernels/fusion/xpu/add_act_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/add_act_xpu_kernel.cc index ca221cbc7f4129..5a06ba1422fc03 100644 --- a/paddle/phi/kernels/fusion/xpu/add_act_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/add_act_xpu_kernel.cc @@ -37,8 +37,8 @@ void AddActXPUKernel(const Context& ctx, y_max.get_ptr() == nullptr ? nullptr : y_max.get_ptr()->data(); auto* out_data = reinterpret_cast(ctx.template Alloc(out)); - std::vector x_shape = phi::vectorize(x.dims()); - std::vector y_shape = phi::vectorize(y.dims()); + std::vector x_shape = common::vectorize(x.dims()); + std::vector y_shape = common::vectorize(y.dims()); xpu::Activation_t act(static_cast(act_type)); int r = xpu::add_activation_fusion( // TX/TY/TZ/TID diff --git a/paddle/phi/kernels/fusion/xpu/add_layernorm_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/add_layernorm_xpu_kernel.cc index e110abca728a08..3cb98906218abb 100644 --- a/paddle/phi/kernels/fusion/xpu/add_layernorm_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/add_layernorm_xpu_kernel.cc @@ -60,7 +60,7 @@ static phi::DDim BroadCastInferShape(const DDim x_dims, max_dim, axis); - return phi::make_ddim(out_dims_array); + return common::make_ddim(out_dims_array); } return x_dims; } @@ -84,7 +84,7 @@ void AddLayernormXPUKernel(const Context& ctx, auto x_dims = x.dims(); auto y_dims = y.dims(); auto out_dims = BroadCastInferShape(x_dims, y_dims, -1); - auto layer_norm_x_mat_dims = phi::flatten_to_2d(out_dims, begin_norm_axis); + auto layer_norm_x_mat_dims = common::flatten_to_2d(out_dims, begin_norm_axis); int64_t m = layer_norm_x_mat_dims[0]; int64_t n = layer_norm_x_mat_dims[1]; diff --git a/paddle/phi/kernels/fusion/xpu/bn_act_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/bn_act_xpu_kernel.cc index b14cc2e85fab21..81e6e670933628 100644 --- a/paddle/phi/kernels/fusion/xpu/bn_act_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/bn_act_xpu_kernel.cc @@ -33,7 +33,7 @@ void BNActXPUKernel(const Context& dev_ctx, int act_type, DenseTensor* y) { using XPUType = typename XPUTypeTrait::Type; - const auto data_layout = phi::StringToDataLayout(data_layout_str); + const auto data_layout = common::StringToDataLayout(data_layout_str); PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", true, phi::errors::InvalidArgument( diff --git a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc index 6ba3d84b5eb0b8..aa5d4738aafa97 100644 --- 
a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc @@ -54,9 +54,10 @@ void Conv2dXPUKernelImpl(const Context& ctx, // update paddings and dilations accoring to padding_algorithm std::vector paddings_vec = paddings; std::vector dilations_vec = dilations; - DDim in_data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); - DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); + DDim in_data_dims = common::slice_ddim(input_dims, 2, input_dims.size()); + DDim filter_data_dims = + common::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); phi::UpdatePaddingAndDilation(&paddings_vec, &dilations_vec, padding_algorithm, diff --git a/paddle/phi/kernels/fusion/xpu/conv_transpose_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/conv_transpose_xpu_kernel.cc index c443e109dc2b6b..58f40f3040f74c 100644 --- a/paddle/phi/kernels/fusion/xpu/conv_transpose_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/conv_transpose_xpu_kernel.cc @@ -48,7 +48,7 @@ void Conv2dTransposeXPUKernel(const Context& ctx, DDim in_data_dims = slice_ddim(x.dims(), 2, x.dims().size()); // hw DDim filter_data_dims = slice_ddim(filter.dims(), 2, filter.dims().size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); std::vector paddings_ = paddings; std::vector dilations_ = dilations; UpdatePaddingAndDilation( diff --git a/paddle/phi/kernels/fusion/xpu/fast_layernorm_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fast_layernorm_xpu_kernel.cc index 6170cd80faf9df..8012462e5c9bc8 100644 --- a/paddle/phi/kernels/fusion/xpu/fast_layernorm_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fast_layernorm_xpu_kernel.cc @@ -28,7 +28,7 @@ void FastLayerNormXPUKernel(const Context& ctx, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; const auto& x_dims = x.dims(); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); const auto* x_data = x.data(); diff --git a/paddle/phi/kernels/fusion/xpu/fast_where_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fast_where_xpu_kernel.cc index 8404bcb92015b0..3a2def32fefd26 100644 --- a/paddle/phi/kernels/fusion/xpu/fast_where_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fast_where_xpu_kernel.cc @@ -30,9 +30,9 @@ void FastWhereXPUKernel(const Context& ctx, auto* x_data = reinterpret_cast(x.data()); auto* y_data = reinterpret_cast(y.data()); auto* out_data = reinterpret_cast(ctx.template Alloc(out)); - auto condition_dims = phi::vectorize(condition.dims()); - auto x_dims = phi::vectorize(x.dims()); - auto y_dims = phi::vectorize(y.dims()); + auto condition_dims = common::vectorize(condition.dims()); + auto x_dims = common::vectorize(x.dims()); + auto y_dims = common::vectorize(y.dims()); PADDLE_ENFORCE_EQ( x_dims, y_dims, diff --git a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc index 3448efca7c3ab1..29f74e8e1fe237 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/common/ddim.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc index 221305014190bd..dab55c1bbc10ae 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/common/ddim.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_int8_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_int8_xpu_kernel.cc index 87fb42c9e23b97..236e276cb937d3 100755 --- a/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_int8_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_int8_xpu_kernel.cc @@ -294,7 +294,7 @@ void FusedMultiTransformerInt8XpuKernel( cache_kv_out[i]->ResizeAndAllocate(cache_kv_gather_dims); int64_t curr_index_len = gather_index_t->dims().size() == 0 ? 1 : gather_index_t->dims()[0]; - auto curr_xshape = phi::vectorize(cache_kv_dims); + auto curr_xshape = common::vectorize(cache_kv_dims); if (reinterpret_cast( ctx.template Alloc(cache_kv_out[i])) == cache_kv_data && curr_index_len < curr_xshape[gather_axis]) { @@ -339,7 +339,7 @@ void FusedMultiTransformerInt8XpuKernel( cache_kv_data, gather_index_t->data(), reinterpret_cast(cache_kv_gather_tensor.data()), - phi::vectorize(cache_kv_dims), + common::vectorize(cache_kv_dims), gather_index_t->dims().size() == 0 ? 1 : gather_index_t->dims()[0], gather_axis); @@ -349,7 +349,7 @@ void FusedMultiTransformerInt8XpuKernel( cache_kv_data, gather_index_t->data(), reinterpret_cast(cache_kv_gather_tensor.data()), - phi::vectorize(cache_kv_dims), + common::vectorize(cache_kv_dims), gather_index_t->dims().size() == 0 ? 1 : gather_index_t->dims()[0], gather_axis); @@ -376,7 +376,7 @@ void FusedMultiTransformerInt8XpuKernel( cache_kv_data, gather_index_t->data(), cache_kv_data, - phi::vectorize(cache_kv_dims), + common::vectorize(cache_kv_dims), gather_index_t->dims().size() == 0 ? 1 : gather_index_t->dims()[0], gather_axis, @@ -389,7 +389,7 @@ void FusedMultiTransformerInt8XpuKernel( cache_kv_data, gather_index_t->data(), cache_kv_data, - phi::vectorize(cache_kv_dims), + common::vectorize(cache_kv_dims), gather_index_t->dims().size() == 0 ? 
1 : gather_index_t->dims()[0], gather_axis, diff --git a/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_xpu_kernel.cc index 879824668a5438..8c151e0257e0e0 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_xpu_kernel.cc @@ -262,7 +262,7 @@ void FusedMultiTransformerXpuKernel( cache_kv_data, gather_index_t->data(), reinterpret_cast(cache_kv_gather_tensor.data()), - phi::vectorize(cache_kv_dims), + common::vectorize(cache_kv_dims), gather_index_t->dims().size() == 0 ? 1 : gather_index_t->dims()[0], gather_axis); @@ -272,7 +272,7 @@ void FusedMultiTransformerXpuKernel( cache_kv_data, gather_index_t->data(), reinterpret_cast(cache_kv_gather_tensor.data()), - phi::vectorize(cache_kv_dims), + common::vectorize(cache_kv_dims), gather_index_t->dims().size() == 0 ? 1 : gather_index_t->dims()[0], gather_axis); @@ -292,7 +292,7 @@ void FusedMultiTransformerXpuKernel( cache_kv_data, gather_index_t->data(), cache_kv_data, - phi::vectorize(cache_kv_dims), + common::vectorize(cache_kv_dims), gather_index_t->dims().size() == 0 ? 1 : gather_index_t->dims()[0], gather_axis); @@ -302,7 +302,7 @@ void FusedMultiTransformerXpuKernel( cache_kv_data, gather_index_t->data(), cache_kv_data, - phi::vectorize(cache_kv_dims), + common::vectorize(cache_kv_dims), gather_index_t->dims().size() == 0 ? 1 : gather_index_t->dims()[0], gather_axis); diff --git a/paddle/phi/kernels/fusion/xpu/fused_softmax_mask_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_softmax_mask_kernel.cc index 3a8083b9945fb3..01a76f36557bcb 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_softmax_mask_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_softmax_mask_kernel.cc @@ -56,8 +56,8 @@ void FusedSoftmaxMaskKernel(const Context& dev_ctx, idx, mask_dim[idx])); } - std::vector x_shape = phi::vectorize(x.dims()); - std::vector mask_shape = phi::vectorize(mask.dims()); + std::vector x_shape = common::vectorize(x.dims()); + std::vector mask_shape = common::vectorize(mask.dims()); // int softmax_with_mask(Context* ctx, const T* x, const T* mask, T* y, const // std::vector& x_shape, const std::vector& mask_shape); diff --git a/paddle/phi/kernels/fusion/xpu/layer_norm_act_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/layer_norm_act_xpu_kernel.cc index ead6959ba6debc..ec0bec2fbbe383 100644 --- a/paddle/phi/kernels/fusion/xpu/layer_norm_act_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/layer_norm_act_xpu_kernel.cc @@ -31,7 +31,7 @@ void LayerNormActXPUKernel(const Context& ctx, DenseTensor* y) { using XPUType = typename XPUTypeTrait::Type; const auto& x_dims = x.dims(); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); const auto* x_data = x.data(); diff --git a/paddle/phi/kernels/fusion/xpu/yolo_box_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/yolo_box_xpu_kernel.cc index 0164f8439bdae4..259726bf89094a 100644 --- a/paddle/phi/kernels/fusion/xpu/yolo_box_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/yolo_box_xpu_kernel.cc @@ -70,10 +70,11 @@ void YoloBoxXPUKernel(const Context& ctx, stride_data = stride.data(); anchor_grid_data = anchor_grid.data(); } - std::vector x_shape = phi::vectorize(x.dims()); - std::vector grid_shape = phi::vectorize(grid.dims()); - std::vector stride_shape = phi::vectorize(stride.dims()); - 
std::vector anchor_grid_shape = phi::vectorize(anchor_grid.dims()); + std::vector x_shape = common::vectorize(x.dims()); + std::vector grid_shape = common::vectorize(grid.dims()); + std::vector stride_shape = common::vectorize(stride.dims()); + std::vector anchor_grid_shape = + common::vectorize(anchor_grid.dims()); // yolo_box_coord only support fp32&&fp16 precision int r = xpu::yolo_box_coord( /* baidu::xpu::api::Context* ctx */ ctx.x_context(), diff --git a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu index f42d13df86a7c0..d2833db851f77f 100644 --- a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu @@ -40,7 +40,7 @@ struct Linspace { bool align_corners, DenseTensor* numbers, const phi::GPUContext& dev_ctx) { - numbers->Resize(phi::make_ddim({count})); + numbers->Resize(common::make_ddim({count})); T* number_data = dev_ctx.template Alloc(numbers); T slice = (end - start) / (T)(count - 1); if (!align_corners) { @@ -144,7 +144,7 @@ void AffineGridGrad4DCUDAKernel(const Context& dev_ctx, int w = 0; h = size_attr[2]; w = size_attr[3]; - theta_grad->Resize(phi::make_ddim({n, 2, 3})); + theta_grad->Resize(common::make_ddim({n, 2, 3})); T* theta_grad_data = dev_ctx.template Alloc(theta_grad); phi::funcs::SetConstant()( dev_ctx, theta_grad, static_cast(0)); @@ -199,7 +199,7 @@ void AffineGridGrad5DCUDAKernel(const Context& dev_ctx, d = size_attr[2]; h = size_attr[3]; w = size_attr[4]; - theta_grad->Resize(phi::make_ddim({n, 3, 4})); + theta_grad->Resize(common::make_ddim({n, 3, 4})); T* theta_grad_data = dev_ctx.template Alloc(theta_grad); phi::funcs::SetConstant()( dev_ctx, theta_grad, static_cast(0)); diff --git a/paddle/phi/kernels/gpu/affine_grid_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_kernel.cu index d9d539ce28e23f..71220ba40700ea 100644 --- a/paddle/phi/kernels/gpu/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_kernel.cu @@ -39,7 +39,7 @@ struct Linspace { bool align_corners, DenseTensor* numbers, const phi::GPUContext& dev_ctx) { - numbers->Resize(phi::make_ddim({count})); + numbers->Resize(common::make_ddim({count})); T* number_data = dev_ctx.template Alloc(numbers); T slice = (end - start) / (T)(count - 1); if (!align_corners) { @@ -136,7 +136,7 @@ void AffineGrid4DCUDAKernel(const Context& dev_ctx, int w = 0; h = size_attr[2]; w = size_attr[3]; - output->Resize(phi::make_ddim({n, h, w, 2})); + output->Resize(common::make_ddim({n, h, w, 2})); T* out_data = dev_ctx.template Alloc(output); T h_step; @@ -186,7 +186,7 @@ void AffineGrid5DCUDAKernel(const Context& dev_ctx, d = size_attr[2]; h = size_attr[3]; w = size_attr[4]; - output->Resize(phi::make_ddim({n, d, h, w, 3})); + output->Resize(common::make_ddim({n, d, h, w, 3})); T* out_data = dev_ctx.template Alloc(output); T d_step; diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu index 3c793e106f049b..10905ff89e18e9 100644 --- a/paddle/phi/kernels/gpu/arange_kernel.cu +++ b/paddle/phi/kernels/gpu/arange_kernel.cu @@ -14,12 +14,12 @@ #include "paddle/phi/kernels/arange_kernel.h" +#include "paddle/common/errors.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include 
"paddle/phi/kernels/funcs/range_function.h" @@ -47,7 +47,7 @@ void ArangeTensorKernel(const Context& dev_ctx, int64_t size = 0; phi::funcs::GetSize(start_value, end_value, step_value, &size); - out->Resize(phi::make_ddim({size})); + out->Resize(common::make_ddim({size})); T* out_data = dev_ctx.template Alloc(out); auto stream = dev_ctx.stream(); @@ -68,7 +68,7 @@ void ArangeNullaryKernel(const Context& dev_ctx, DenseTensor* out) { int64_t size = 0; phi::funcs::GetSize(start_value, end_value, step_value, &size); - out->Resize(phi::make_ddim({size})); + out->Resize(common::make_ddim({size})); T* out_data = dev_ctx.template Alloc(out); auto stream = dev_ctx.stream(); diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu index caa635255b9878..8a628560f27adc 100644 --- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu +++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu @@ -28,7 +28,7 @@ namespace cub = hipcub; #endif #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -171,7 +171,7 @@ struct VisitDataCudaArgMinMaxFunctor { phi::DDim x_dims; int new_axis = axis; if (flatten) { - x_dims = phi::make_ddim({x.numel()}); + x_dims = common::make_ddim({x.numel()}); // if flatten, the axis just as 0 new_axis = 0; } else { diff --git a/paddle/phi/kernels/gpu/argsort_grad_kernel.cu b/paddle/phi/kernels/gpu/argsort_grad_kernel.cu index 4cc6b1dd3cb888..5d7dcc08e44c55 100644 --- a/paddle/phi/kernels/gpu/argsort_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/argsort_grad_kernel.cu @@ -175,7 +175,7 @@ void ArgsortGradKernel(const Context& dev_ctx, // Special case for full sort, speedup ~190x. if (axis == -1 || axis + 1 == in_dims.size()) { const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; ArgFullAssign( dev_ctx, &out_grad, &indices, in_grad, input_height, input_width); @@ -204,8 +204,8 @@ void ArgsortGradKernel(const Context& dev_ctx, TransposeKernel(dev_ctx, out_grad, trans, &trans_dO); TransposeKernel(dev_ctx, indices, trans, &trans_ind); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const int64_t input_width = trans_dims[trans_dims.size() - 1]; DenseTensor tmp_out; diff --git a/paddle/phi/kernels/gpu/argsort_kernel.cu b/paddle/phi/kernels/gpu/argsort_kernel.cu index 4d16f826cca86a..48a5cfd4d09e9d 100644 --- a/paddle/phi/kernels/gpu/argsort_kernel.cu +++ b/paddle/phi/kernels/gpu/argsort_kernel.cu @@ -109,7 +109,7 @@ void ArgFullSort(const phi::GPUContext& ctx, auto cu_stream = ctx.stream(); DenseTensor input_indices; const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); + auto dim = common::make_ddim(dims); input_indices.Resize(dim); ctx.template Alloc(&input_indices); size_t temp_storage_bytes = -1; @@ -264,7 +264,7 @@ void ArgsortKernel(const Context& dev_ctx, // Special case for full sort, speedup ~190x. 
if (axis == -1 || axis + 1 == in_dims.size()) { const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; ArgFullSort(dev_ctx, &input, @@ -295,8 +295,8 @@ void ArgsortKernel(const Context& dev_ctx, // Do transpose TransposeKernel(dev_ctx, input, trans, &trans_inp); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const int64_t input_width = trans_dims[trans_dims.size() - 1]; DenseTensor tmp_out; diff --git a/paddle/phi/kernels/gpu/assign_pos_kernel.cu b/paddle/phi/kernels/gpu/assign_pos_kernel.cu index dc164a8bbe6d92..891a18fc413155 100644 --- a/paddle/phi/kernels/gpu/assign_pos_kernel.cu +++ b/paddle/phi/kernels/gpu/assign_pos_kernel.cu @@ -70,7 +70,7 @@ void AssignPosKernel(const Context& dev_ctx, cpu_eff_num_len_data = cpu_eff_num_len.data()[0]; } - phi::DDim out_dims = phi::make_ddim({cpu_eff_num_len_data}); + phi::DDim out_dims = common::make_ddim({cpu_eff_num_len_data}); out->Resize(out_dims); auto out_data = dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index c3c353859728b7..c275f58ff734b9 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -14,9 +14,9 @@ #include "glog/logging.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" @@ -508,7 +508,7 @@ void BatchNormGradFunctor(const Context &ctx, DenseTensor *bias_grad) { double epsilon = static_cast(epsilon_f); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); const auto *d_y = &y_grad; @@ -1353,7 +1353,7 @@ void BatchNormDoubleGradKernel( "you want to use global status in pre_train model, " "please set `use_global_stats = True`")); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); const DenseTensor *running_mean = nullptr; const DenseTensor *running_variance = nullptr; diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 20aa02a5f24856..2158d0d1189f59 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -22,9 +22,9 @@ namespace cub = hipcub; #include "glog/logging.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" @@ -532,7 +532,7 @@ void BatchNormKernel(const Context &ctx, DenseTensor *reserve_space) { double epsilon = epsilon_f; const bool trainable_stats = trainable_statistics; - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); bool test_mode = is_test && 
(!trainable_stats); // Get the size for each dimension. diff --git a/paddle/phi/kernels/gpu/c_split_kernel.cu b/paddle/phi/kernels/gpu/c_split_kernel.cu index 2fda7d3cf37f0d..f003e4a73f802c 100644 --- a/paddle/phi/kernels/gpu/c_split_kernel.cu +++ b/paddle/phi/kernels/gpu/c_split_kernel.cu @@ -86,8 +86,8 @@ void CSplitKernel(const Context& ctx, int64_t end_size = dims[dims_size - 1]; // remain dim - auto remain_ddim = phi::slice_ddim(dims, 0, dims_size - 1); - int64_t remain_numel = phi::product(remain_ddim); + auto remain_ddim = common::slice_ddim(dims, 0, dims_size - 1); + int64_t remain_numel = common::product(remain_ddim); int64_t limit = x.numel(); int64_t blocks = NumBlocks(limit); diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index f0dc0c9153430c..74ba93d05893d2 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -83,8 +83,8 @@ void ConcatKernel(const Context& dev_ctx, if (in->numel() == 0UL) { continue; } - auto in_stride = phi::stride_numel(in->dims()); - auto out_stride = phi::stride_numel(out->dims()); + auto in_stride = common::stride_numel(in->dims()); + auto out_stride = common::stride_numel(out->dims()); phi::funcs::StridedNumelCopyWithAxis( dev_ctx, axis, diff --git a/paddle/phi/kernels/gpu/contiguous_kernel.cu b/paddle/phi/kernels/gpu/contiguous_kernel.cu index ff53a9456182fb..6405b35599ed2c 100644 --- a/paddle/phi/kernels/gpu/contiguous_kernel.cu +++ b/paddle/phi/kernels/gpu/contiguous_kernel.cu @@ -30,7 +30,7 @@ template __global__ void ContiguousCaseZeroFunc( const T* input_data, T* out_data, - phi::Array input_stride) { + Array input_stride) { int64_t input_offset = 0; int64_t output_offset = (blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x) * @@ -56,8 +56,8 @@ template __global__ void ContiguousCaseOneFunc( const T* input_data, T* out_data, - phi::Array input_stride, - phi::Array dims, + Array input_stride, + Array dims, const int64_t x_max) { int64_t x = blockIdx.x * blockDim.x + threadIdx.x; if (x < x_max) { @@ -511,8 +511,8 @@ void ContiguousKernel(const Context& dev_ctx, return; } - phi::Array input_stride; - phi::Array input_dims; + Array input_stride; + Array input_dims; for (int i = 0; i < input.dims().size(); i++) { input_dims[i] = input.dims()[i]; input_stride[i] = input.strides()[i]; diff --git a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu index c64facc1e6879b..e96d53b1fdb311 100644 --- a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/conv_transpose_grad_kernel.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -71,7 +71,7 @@ void DepthwiseConv2dTransposeGradKernel(const Context& ctx, const std::string& data_format, DenseTensor* dx, DenseTensor* dfilter) { - const DataLayout data_layout = phi::StringToDataLayout(data_format); + const DataLayout data_layout = common::StringToDataLayout(data_format); DenseTensor filter_ = filter; if (!dx && !dfilter) { @@ -91,7 +91,7 @@ void DepthwiseConv2dTransposeGradKernel(const Context& ctx, in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); } DDim filter_data_dims = 
slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); diff --git a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu index bee31450cbf70f..7d29f3503fd35c 100644 --- a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/conv_transpose_kernel.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -37,7 +37,7 @@ void DepthwiseConv2dTransposeKernel(const Context& ctx, const std::vector& dilations, const std::string& data_format, DenseTensor* out) { - const DataLayout data_layout = phi::StringToDataLayout(data_format); + const DataLayout data_layout = common::StringToDataLayout(data_format); DenseTensor filter_ = filter; ctx.template Alloc(out); @@ -72,7 +72,7 @@ void DepthwiseConv2dTransposeKernel(const Context& ctx, in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); } DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); diff --git a/paddle/phi/kernels/gpu/cum_maxmin_kernel.cu b/paddle/phi/kernels/gpu/cum_maxmin_kernel.cu index 49903bde6ff99b..24ba48429e10ce 100644 --- a/paddle/phi/kernels/gpu/cum_maxmin_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_maxmin_kernel.cu @@ -275,7 +275,7 @@ void ScanWithIndicesKernel(const Context& dev_ctx, x_data, values_data, indices_data, num_rows, row_size, init, op); } else { int64_t row_size = x.dims()[axis]; - auto sizes = phi::vectorize(x.dims()); + auto sizes = common::vectorize(x.dims()); const int64_t num_orows = std::accumulate(sizes.begin(), diff --git a/paddle/phi/kernels/gpu/decode_jpeg_kernel.cu b/paddle/phi/kernels/gpu/decode_jpeg_kernel.cu index 0b5a10b93d85a1..ef6ce5d159aeb6 100644 --- a/paddle/phi/kernels/gpu/decode_jpeg_kernel.cu +++ b/paddle/phi/kernels/gpu/decode_jpeg_kernel.cu @@ -118,7 +118,7 @@ void DecodeJpegKernel(const Context& dev_ctx, int sz = widths[0] * heights[0]; std::vector out_shape = {output_components, height, width}; - out->Resize(phi::make_ddim(out_shape)); + out->Resize(common::make_ddim(out_shape)); T* data = dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 2908a155cdedd3..278b219b453d3d 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -38,6 +38,7 @@ namespace math { * \brief Compute the depthwise convolution which include * forward process and backpropagation process */ +using DataLayout = phi::DataLayout; template diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu index b2856080a7873e..a46eb02dc7c8a5 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // 
limitations under the License. +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" @@ -57,14 +57,14 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, auto filter_dims = filter.dims(); DDim in_data_dims; - const phi::DataLayout data_layout = phi::StringToDataLayout(data_format); + const phi::DataLayout data_layout = common::StringToDataLayout(data_format); if (data_layout != phi::DataLayout::kNHWC) { in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); } else { in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); } DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu index cd4579ef16d58d..eb87c49a9de4b2 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu @@ -77,7 +77,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, auto filter_dims = filter.dims(); DDim in_data_dims; - const phi::DataLayout data_layout = phi::StringToDataLayout(data_format); + const phi::DataLayout data_layout = common::StringToDataLayout(data_format); if (data_layout != phi::DataLayout::kNHWC) { in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); } else { @@ -85,7 +85,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, } DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); diff --git a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu index bac9a297b580e9..681954317d51c4 100644 --- a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu @@ -36,7 +36,8 @@ void DiagonalGradKernel(const Context& dev_ctx, auto dout_dim = dout->dims().Get(); auto dout_dim_size = dout->dims().size(); - std::vector res_dout = vectorize(phi::stride(dout->dims())); + std::vector res_dout = + common::vectorize(common::stride(dout->dims())); DenseTensor dout_stride_tensor; phi::TensorFromVector(res_dout, dev_ctx, &dout_stride_tensor); int64_t* dout_stride = dout_stride_tensor.data(); @@ -46,7 +47,7 @@ void DiagonalGradKernel(const Context& dev_ctx, auto dx_dim = dx->dims().Get(); auto dx_dim_size = dx->dims().size(); - std::vector res_dx = vectorize(phi::stride(dx->dims())); + std::vector res_dx = common::vectorize(common::stride(dx->dims())); DenseTensor dx_stride_tensor; phi::TensorFromVector(res_dx, dev_ctx, &dx_stride_tensor); int64_t* dx_stride = dx_stride_tensor.data(); diff --git a/paddle/phi/kernels/gpu/diagonal_kernel.cu b/paddle/phi/kernels/gpu/diagonal_kernel.cu index 2acc527e9b7c7f..6adcb9b28f5d81 100644 --- a/paddle/phi/kernels/gpu/diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_kernel.cu @@ -33,7 +33,8 @@ void DiagonalKernel(const Context& dev_ctx, auto input_dim = input->dims().Get(); auto input_dim_size 
= input->dims().size(); - std::vector res_in = vectorize(phi::stride(input->dims())); + std::vector res_in = + common::vectorize(common::stride(input->dims())); DenseTensor input_stride_tensor; phi::TensorFromVector(res_in, dev_ctx, &input_stride_tensor); int64_t* input_stride = input_stride_tensor.data(); @@ -43,7 +44,8 @@ void DiagonalKernel(const Context& dev_ctx, auto output_dim = output->dims().Get(); auto output_dim_size = output->dims().size(); - std::vector res_out = vectorize(phi::stride(output->dims())); + std::vector res_out = + common::vectorize(common::stride(output->dims())); DenseTensor output_stride_tensor; phi::TensorFromVector(res_out, dev_ctx, &output_stride_tensor); int64_t* output_stride = output_stride_tensor.data(); diff --git a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/dist_kernel.cu index e146fb47cf66d4..a9cbf97b975f22 100644 --- a/paddle/phi/kernels/gpu/dist_kernel.cu +++ b/paddle/phi/kernels/gpu/dist_kernel.cu @@ -134,7 +134,7 @@ void DistKernel(const Context& dev_ctx, if (xdim == y.dims()) { // same shape auto n = x.numel(); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); - intermediate.Resize(phi::make_ddim({config.block_per_grid.x})); + intermediate.Resize(common::make_ddim({config.block_per_grid.x})); T* i_ptr = dev_ctx.template Alloc(&intermediate); std::vector axis_dims = {static_cast(-1)}; diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index a7c75e64a462ad..8689f7fde8b3ba 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -209,7 +209,7 @@ struct EmbeddingSparseGradCUDAFunctor { auto* d_output_data = d_output->template data(); auto d_output_dims = d_output->dims(); auto d_output_dims_2d = - phi::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); + common::flatten_to_2d(d_output_dims, d_output_dims.size() - 1); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output_dims_2d, phi::errors::InvalidArgument( diff --git a/paddle/phi/kernels/gpu/expand_as_kernel.cu b/paddle/phi/kernels/gpu/expand_as_kernel.cu index b296b5b7e2014d..6bd7cb80da28fe 100644 --- a/paddle/phi/kernels/gpu/expand_as_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_as_kernel.cu @@ -31,7 +31,7 @@ void ExpandAsKernel(const Context& ctx, DenseTensor* out) { int rank = x.dims().size(); int target_rank = static_cast(target_shape.size()); - auto vec_in_dims = phi::vectorize(x.dims()); + auto vec_in_dims = common::vectorize(x.dims()); unsigned int diff = target_rank - rank; vec_in_dims.insert(vec_in_dims.begin(), diff, 1); diff --git a/paddle/phi/kernels/gpu/expand_kernel.cu b/paddle/phi/kernels/gpu/expand_kernel.cu index dc632ce4d4e63b..ef5643737f4007 100644 --- a/paddle/phi/kernels/gpu/expand_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_kernel.cu @@ -29,7 +29,7 @@ void ExpandKernel(const Context& ctx, DenseTensor* out) { auto expand_shape = shape.GetData(); auto diff = expand_shape.size() - x.dims().size(); - auto out_shape = phi::vectorize(x.dims()); + auto out_shape = common::vectorize(x.dims()); out_shape.insert(out_shape.begin(), diff, 1); for (size_t i = 0; i < out_shape.size(); ++i) { PADDLE_ENFORCE_NE( @@ -69,7 +69,7 @@ void ExpandKernel(const Context& ctx, } } - out->Resize(phi::make_ddim(out_shape)); + out->Resize(common::make_ddim(out_shape)); ctx.template Alloc(out); std::vector ins = {&x}; std::vector outs = {out}; diff --git a/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu 
b/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu index eda1d3ba2225bf..f4eb8fcb8c059d 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu @@ -73,7 +73,7 @@ void FillDiagonalTensorGradKernel(const Context &ctx, auto stream = ctx.stream(); DenseTensor tensor_tmp; - tensor_tmp.Resize(phi::make_ddim({2 + matrows})); + tensor_tmp.Resize(common::make_ddim({2 + matrows})); int64_t *memory_block_cu = ctx.template Alloc(&tensor_tmp); const auto gpu_place = ctx.GetPlace(); memory_utils::Copy(gpu_place, diff --git a/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu index 8e6841cf6bb5bb..0f43a57c54de6b 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu @@ -58,7 +58,7 @@ void FillDiagonalTensorKernel(const Context &ctx, auto out_dims = out->dims(); auto matdims = y.dims(); - auto fill_dims = phi::flatten_to_2d(matdims, matdims.size() - 1); + auto fill_dims = common::flatten_to_2d(matdims, matdims.size() - 1); int64_t new_dims[2]; std::vector memory_block; @@ -89,7 +89,7 @@ void FillDiagonalTensorKernel(const Context &ctx, auto stream = ctx.stream(); DenseTensor tensor_tmp; - tensor_tmp.Resize(phi::make_ddim({2 + fill_dims[0]})); + tensor_tmp.Resize(common::make_ddim({2 + fill_dims[0]})); int64_t *memory_block_cu = ctx.template Alloc(&tensor_tmp); const auto gpu_place = ctx.GetPlace(); memory_utils::Copy(gpu_place, diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index 71fdbcaaa68bb4..4b73fd48d95985 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -13,11 +13,11 @@ // limitations under the License. 
#include "paddle/phi/kernels/flip_kernel.h" +#include "paddle/common/array.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/array.h" namespace phi { @@ -74,7 +74,7 @@ void FlipKernel(const Context& dev_ctx, const int64_t numel = x.numel(); size_t flip_dims_size = axis.size(); - auto x_stride = phi::stride(x_dims); + auto x_stride = common::stride(x_dims); phi::Array stride_array; phi::Array shape_array; diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index bd1d7db96cfeca..fde2e33505f971 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -41,7 +41,7 @@ void FullKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); int numel = out->numel(); dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu b/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu index d2bb9c31fa67da..7846a596a6bd64 100644 --- a/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu @@ -27,7 +27,7 @@ void GaussianInplaceGradKernel(const Context& ctx, float std, int seed, DenseTensor* x_grad) { - auto dims = vectorize(x_grad->dims()); + auto dims = common::vectorize(x_grad->dims()); float value = static_cast(0.0f); phi::FullKernel(ctx, dims, value, phi::DataType::UNDEFINED, x_grad); } diff --git a/paddle/phi/kernels/gpu/gaussian_kernel.cu b/paddle/phi/kernels/gpu/gaussian_kernel.cu index 6e5c7ee63ce531..36fad8215d8261 100644 --- a/paddle/phi/kernels/gpu/gaussian_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_kernel.cu @@ -59,7 +59,7 @@ void GaussianKernel(const Context& dev_ctx, int seed, DataType dtype, DenseTensor* out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); dev_ctx.template Alloc(out); if (seed == 0) { // use global Generator seed diff --git a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu index 38e0e27d99f14e..ce2f8dc2467ed0 100644 --- a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu @@ -47,16 +47,16 @@ static void SortDescending(const phi::GPUContext &ctx, DenseTensor *index_out) { int num = static_cast(value.numel()); DenseTensor index_in_t; - index_in_t.Resize(phi::make_ddim({num})); + index_in_t.Resize(common::make_ddim({num})); int *idx_in = ctx.template Alloc(&index_in_t); phi::funcs::ForRange for_range(ctx, num); for_range(funcs::RangeInitFunctor{0, 1, idx_in}); - index_out->Resize(phi::make_ddim({num})); + index_out->Resize(common::make_ddim({num})); int *idx_out = ctx.template Alloc(index_out); const T *keys_in = value.data(); - value_out->Resize(phi::make_ddim({num})); + value_out->Resize(common::make_ddim({num})); T *keys_out = ctx.template Alloc(value_out); // Determine temporary device storage requirements @@ -333,7 +333,7 @@ static void NMS(const phi::GPUContext &ctx, } } } - keep_out->Resize(phi::make_ddim({num_to_keep})); + keep_out->Resize(common::make_ddim({num_to_keep})); int *keep = ctx.template Alloc(keep_out); memory_utils::Copy(place, keep, @@ -364,12 +364,12 @@ static std::pair ProposalForOneImage( int num = scores.numel(); 
int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() : pre_nms_top_n; - scores_sort.Resize(phi::make_ddim({pre_nms_num, 1})); - index_sort.Resize(phi::make_ddim({pre_nms_num, 1})); + scores_sort.Resize(common::make_ddim({pre_nms_num, 1})); + index_sort.Resize(common::make_ddim({pre_nms_num, 1})); // 2. box decode and clipping DenseTensor proposals; - proposals.Resize(phi::make_ddim({pre_nms_num, 4})); + proposals.Resize(common::make_ddim({pre_nms_num, 4})); ctx.template Alloc(&proposals); { @@ -385,9 +385,9 @@ static std::pair ProposalForOneImage( // 3. filter DenseTensor keep_index, keep_num_t; - keep_index.Resize(phi::make_ddim({pre_nms_num})); + keep_index.Resize(common::make_ddim({pre_nms_num})); ctx.template Alloc(&keep_index); - keep_num_t.Resize(phi::make_ddim({1})); + keep_num_t.Resize(common::make_ddim({1})); ctx.template Alloc(&keep_num_t); min_size = std::max(min_size, 1.0f); auto stream = ctx.stream(); @@ -408,23 +408,23 @@ static std::pair ProposalForOneImage( sizeof(int), ctx.stream()); ctx.Wait(); - keep_index.Resize(phi::make_ddim({keep_num})); + keep_index.Resize(common::make_ddim({keep_num})); DenseTensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { phi::funcs::SetConstant set_zero; - proposals_filter.Resize(phi::make_ddim({1, 4})); + proposals_filter.Resize(common::make_ddim({1, 4})); ctx.template Alloc(&proposals_filter); - scores_filter.Resize(phi::make_ddim({1, 1})); + scores_filter.Resize(common::make_ddim({1, 1})); ctx.template Alloc(&scores_filter); set_zero(ctx, &proposals_filter, static_cast(0)); set_zero(ctx, &scores_filter, static_cast(0)); return std::make_pair(proposals_filter, scores_filter); } - proposals_filter.Resize(phi::make_ddim({keep_num, 4})); + proposals_filter.Resize(common::make_ddim({keep_num, 4})); ctx.template Alloc(&proposals_filter); - scores_filter.Resize(phi::make_ddim({keep_num, 1})); + scores_filter.Resize(common::make_ddim({keep_num, 1})); ctx.template Alloc(&scores_filter); phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); @@ -438,13 +438,13 @@ static std::pair ProposalForOneImage( NMS( ctx, proposals_filter, keep_index, nms_thresh, &keep_nms, pixel_offset); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { - keep_nms.Resize(phi::make_ddim({post_nms_top_n})); + keep_nms.Resize(common::make_ddim({post_nms_top_n})); } DenseTensor scores_nms, proposals_nms; - proposals_nms.Resize(phi::make_ddim({keep_nms.numel(), 4})); + proposals_nms.Resize(common::make_ddim({keep_nms.numel(), 4})); ctx.template Alloc(&proposals_nms); - scores_nms.Resize(phi::make_ddim({keep_nms.numel(), 1})); + scores_nms.Resize(common::make_ddim({keep_nms.numel(), 1})); ctx.template Alloc(&scores_nms); phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); @@ -487,9 +487,9 @@ void GenerateProposalsKernel(const Context &ctx, int64_t w_bbox = bbox_dim[3]; DenseTensor bbox_deltas_swap, scores_swap; - bbox_deltas_swap.Resize(phi::make_ddim({num, h_bbox, w_bbox, c_bbox})); + bbox_deltas_swap.Resize(common::make_ddim({num, h_bbox, w_bbox, c_bbox})); ctx.template Alloc(&bbox_deltas_swap); - scores_swap.Resize(phi::make_ddim({num, h_score, w_score, c_score})); + scores_swap.Resize(common::make_ddim({num, h_score, w_score, c_score})); ctx.template Alloc(&scores_swap); phi::funcs::Transpose trans; @@ -499,12 
+499,12 @@ void GenerateProposalsKernel(const Context &ctx, DenseTensor tmp_anchors = anchors; DenseTensor tmp_variances = variances; - tmp_anchors.Resize(phi::make_ddim({tmp_anchors.numel() / 4, 4})); - tmp_variances.Resize(phi::make_ddim({tmp_variances.numel() / 4, 4})); + tmp_anchors.Resize(common::make_ddim({tmp_anchors.numel() / 4, 4})); + tmp_variances.Resize(common::make_ddim({tmp_variances.numel() / 4, 4})); - rpn_rois->Resize(phi::make_ddim({bbox_deltas.numel() / 4, 4})); + rpn_rois->Resize(common::make_ddim({bbox_deltas.numel() / 4, 4})); ctx.template Alloc(rpn_rois); - rpn_roi_probs->Resize(phi::make_ddim({scores.numel(), 1})); + rpn_roi_probs->Resize(common::make_ddim({scores.numel(), 1})); ctx.template Alloc(rpn_roi_probs); T *rpn_rois_data = rpn_rois->data(); @@ -522,8 +522,9 @@ void GenerateProposalsKernel(const Context &ctx, DenseTensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); DenseTensor scores_slice = scores_swap.Slice(i, i + 1); - bbox_deltas_slice.Resize(phi::make_ddim({h_bbox * w_bbox * c_bbox / 4, 4})); - scores_slice.Resize(phi::make_ddim({h_score * w_score * c_score, 1})); + bbox_deltas_slice.Resize( + common::make_ddim({h_bbox * w_bbox * c_bbox / 4, 4})); + scores_slice.Resize(common::make_ddim({h_score * w_score * c_score, 1})); std::pair box_score_pair = ProposalForOneImage(ctx, @@ -560,7 +561,7 @@ void GenerateProposalsKernel(const Context &ctx, tmp_num.push_back(proposals.dims()[0]); } if (rpn_rois_num != nullptr) { - rpn_rois_num->Resize(phi::make_ddim({num})); + rpn_rois_num->Resize(common::make_ddim({num})); ctx.template Alloc(rpn_rois_num); int *num_data = rpn_rois_num->data(); memory_utils::Copy(place, @@ -569,12 +570,12 @@ void GenerateProposalsKernel(const Context &ctx, &tmp_num[0], sizeof(int) * num, ctx.stream()); - rpn_rois_num->Resize(phi::make_ddim({num})); + rpn_rois_num->Resize(common::make_ddim({num})); } phi::LoD lod; lod.emplace_back(offset); - rpn_rois->Resize(phi::make_ddim({num_proposals, 4})); - rpn_roi_probs->Resize(phi::make_ddim({num_proposals, 1})); + rpn_rois->Resize(common::make_ddim({num_proposals, 4})); + rpn_roi_probs->Resize(common::make_ddim({num_proposals, 1})); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu index ca933cd97c7fb5..b3e34429e9ccb9 100644 --- a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/group_norm_grad_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/group_norm_utils.h" @@ -280,7 +280,7 @@ void GroupNormGradKernel(const Context& dev_ctx, DenseTensor* d_scale, DenseTensor* d_bias) { using AccT = typename phi::dtype::MPTypeTrait::Type; - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); const auto scale_ptr = scale.get_ptr(); const auto bias_ptr = bias.get_ptr(); diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu index 503ba9feefec64..301701c61d34ea 100644 --- a/paddle/phi/kernels/gpu/group_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/group_norm_kernel.h" +#include 
"paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/group_norm_utils.h" @@ -801,7 +801,7 @@ void GroupNormDirectCUDAFunctor::operator()( AccT* mean, AccT* variance, const DataLayout data_layout) { - const auto input_ddim = phi::make_ddim(input_shape); + const auto input_ddim = common::make_ddim(input_shape); const int C = (data_layout == DataLayout::kNCHW ? input_ddim[1] : input_ddim[input_ddim.size() - 1]); @@ -898,7 +898,7 @@ void GroupNormGeneralCaseKernel(const Context& dev_ctx, DenseTensor* mean, DenseTensor* var) { using AccT = typename phi::dtype::MPTypeTrait::Type; - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); const auto scale_ptr = scale.get_ptr(); const auto bias_ptr = bias.get_ptr(); const auto x_dims = x.dims(); diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu index aee591894cc811..124629f580457c 100644 --- a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu @@ -141,7 +141,7 @@ struct GumbleNoiseGenerator { const float temperature) { DenseTensor random_tensor; int64_t size = size_to_axis * size_from_axis; - random_tensor.Resize(make_ddim({size})); + random_tensor.Resize(common::make_ddim({size})); using MPType = typename phi::dtype::MPTypeTrait::Type; MPType* random_data = ctx.template Alloc(&random_tensor); diff --git a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu index c0d5b737c5fbbf..394600a1eb9a6e 100644 --- a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu @@ -42,7 +42,7 @@ void IndexAddGradKernel(const Context& ctx, auto output_dim = out_grad.dims(); auto add_value_dim = add_value_grad->dims(); dim = dim >= 0 ? dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); + auto stride_dim = common::stride(input_dim); int64_t stride = stride_dim[dim]; int64_t size = add_value_dim[dim]; int64_t delta = input_dim[dim] - size; diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index 8fd15d5435f98b..0e8546e88d54a1 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -60,7 +60,7 @@ void IndexAddKernel(const Context& ctx, const auto& index_type = index.dtype(); int dim = axis; dim = dim >= 0 ? 
dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); + auto stride_dim = common::stride(input_dim); int64_t stride = stride_dim[dim]; int64_t size = add_value_dim[dim]; int64_t delta = input_dim[dim] - size; diff --git a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu index d63d670945fba3..b0e2865d75840c 100644 --- a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu @@ -26,8 +26,8 @@ namespace phi { template __global__ void SetZeroCudaKernel(int64_t** indices, - phi::Array stride, - phi::Array shape, + Array stride, + Array shape, const int rank, const int64_t numel, T* out) { @@ -56,14 +56,13 @@ __global__ void SetZeroCudaKernel(int64_t** indices, } template -__global__ void IndexPutGradCudaKernel( - const T* out_grad, - int64_t** indices, - phi::Array stride, - phi::Array shape, - const int rank, - const int64_t numel, - T* value_grad) { +__global__ void IndexPutGradCudaKernel(const T* out_grad, + int64_t** indices, + Array stride, + Array shape, + const int rank, + const int64_t numel, + T* value_grad) { int64_t idx = static_cast(threadIdx.x) + static_cast(blockDim.x) * static_cast(blockIdx.x); @@ -103,10 +102,10 @@ void LaunchIndexPutGradCudaKernel( T* x_grad_data = x_grad->data(); auto x_grad_dims = x_grad->dims(); - auto x_grad_stride = phi::stride(x_grad_dims); + auto x_grad_stride = common::stride(x_grad_dims); - phi::Array stride_array; - phi::Array shape_array; + Array stride_array; + Array shape_array; for (int i = 0; i < rank; ++i) { stride_array[i] = x_grad_stride[i]; shape_array[i] = x_grad_dims[i]; @@ -125,10 +124,10 @@ void LaunchIndexPutGradCudaKernel( } auto out_grad_dims = out_grad.dims(); - auto out_grad_stride = phi::stride(out_grad_dims); + auto out_grad_stride = common::stride(out_grad_dims); - phi::Array stride_array; - phi::Array shape_array; + Array stride_array; + Array shape_array; for (int i = 0; i < rank; ++i) { stride_array[i] = out_grad_stride[i]; shape_array[i] = out_grad_dims[i]; @@ -199,8 +198,9 @@ void LaunchIndexPutGradCudaKernel( numel, tmp_value_grad_data); - std::vector after_dims = phi::vectorize(tmp_value_grad.dims()); - std::vector before_dims = phi::vectorize(value_grad->dims()); + std::vector after_dims = + common::vectorize(tmp_value_grad.dims()); + std::vector before_dims = common::vectorize(value_grad->dims()); std::vector compress_dims; std::vector dims_without_1; @@ -208,7 +208,7 @@ void LaunchIndexPutGradCudaKernel( &after_dims, &before_dims, &compress_dims, &dims_without_1); auto pre_dims = value_grad->dims(); - value_grad->Resize(phi::make_ddim(dims_without_1)); + value_grad->Resize(common::make_ddim(dims_without_1)); IntArray v_axis(compress_dims); SumKernel(dev_ctx, tmp_value_grad, @@ -245,7 +245,7 @@ void IndexPutGradKernel(const Context& dev_ctx, } if (value_grad) { FullKernel(dev_ctx, - phi::vectorize(value_grad->dims()), + common::vectorize(value_grad->dims()), 0.0f, value_grad->dtype(), value_grad); @@ -255,7 +255,7 @@ void IndexPutGradKernel(const Context& dev_ctx, auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); - std::vector res_dim_v(phi::vectorize(bd_dim)); + std::vector res_dim_v(common::vectorize(bd_dim)); std::vector res_indices_v(x.dims().size(), nullptr); std::vector tmp_res_indices_v; std::vector range_tensor_v; diff --git a/paddle/phi/kernels/gpu/index_put_kernel.cu b/paddle/phi/kernels/gpu/index_put_kernel.cu index ee58eab21c53df..ffd4ee7572d562 100644 --- 
a/paddle/phi/kernels/gpu/index_put_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_kernel.cu @@ -25,8 +25,8 @@ template __global__ void IndexPutCudaKernel(const T* x, const T* vals, int64_t** indices, - phi::Array stride, - phi::Array shape, + Array stride, + Array shape, const int rank, const int64_t numel, const int64_t is_single_val_tensor, @@ -78,10 +78,10 @@ void LaunchIndexPutCudaKernel(const Context& dev_ctx, auto x_dims = x.dims(); const int rank = x_dims.size(); - auto x_stride = phi::stride(x_dims); + auto x_stride = common::stride(x_dims); - phi::Array stride_array; - phi::Array shape_array; + Array stride_array; + Array shape_array; for (int i = 0; i < rank; ++i) { stride_array[i] = x_stride[i]; shape_array[i] = x_dims[i]; @@ -134,7 +134,7 @@ void IndexPutKernel(const Context& dev_ctx, } auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); - std::vector res_dim_v(phi::vectorize(bd_dim)); + std::vector res_dim_v(common::vectorize(bd_dim)); std::vector res_indices_v(x.dims().size(), nullptr); std::vector tmp_res_indices_v; std::vector tmp_value_v; @@ -157,7 +157,7 @@ void IndexPutKernel(const Context& dev_ctx, if (value.numel() != 1) { tmp_value_v.emplace_back( - DenseTensor(value.dtype()).Resize(phi::make_ddim(res_dim_v))); + DenseTensor(value.dtype()).Resize(common::make_ddim(res_dim_v))); ExpandKernel( dev_ctx, value, IntArray(res_dim_v), &tmp_value_v[0]); ptr_value = &tmp_value_v[0]; diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index 6d0ba9e5bd4ef9..94f94a319c97a2 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -60,7 +60,7 @@ void IndexSelectGradKernel(const Context& ctx, auto input_dim = x_grad->dims(); auto output_dim = out_grad.dims(); dim = dim >= 0 ? dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); + auto stride_dim = common::stride(input_dim); int64_t stride = stride_dim[dim]; int64_t size = output_dim[dim]; int64_t delta = input_dim[dim] - size; diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu index 910015e00f0103..b81fb22cb96e8a 100644 --- a/paddle/phi/kernels/gpu/index_select_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -34,7 +34,7 @@ void IndexSelectKernel(const Context& ctx, auto input_dim = x.dims(); auto output_dim = output->dims(); dim = dim >= 0 ? 
dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); + auto stride_dim = common::stride(input_dim); int64_t stride = stride_dim[dim]; int64_t size = output_dim[dim]; int64_t delta = input_dim[dim] - size; diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu index 0f17a1bcc318a7..4bdd78e2d6e2c1 100644 --- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -16,8 +16,8 @@ #include "glog/logging.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu index b46584798b1ead..6358611e9e958c 100644 --- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -16,8 +16,8 @@ #include "glog/logging.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu index da633b73bf6e43..f596859fd2d575 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -14,11 +14,11 @@ #include "paddle/phi/kernels/interpolate_grad_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/interpolate_function.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" @@ -761,7 +761,7 @@ static void Interpolate1DCUDABwd( bool align_corners, int align_mode, DenseTensor* input_grad) { - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -875,7 +875,7 @@ static void Interpolate2DCUDABwd( bool align_corners, int align_mode, DenseTensor* input_grad) { - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1134,7 +1134,7 @@ static void Interpolate3DCUDABwd( bool align_corners, int align_mode, DenseTensor* input_grad) { - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu index 7d2211f2758488..a87d235971d2be 100644 --- a/paddle/phi/kernels/gpu/interpolate_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -14,13 +14,13 
@@ #include "paddle/phi/kernels/interpolate_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/interpolate_function.h" #include "paddle/phi/kernels/primitive/datamover_primitives.h" @@ -658,7 +658,7 @@ static void Interpolate1DCUDAFwd( DenseTensor* output) { auto* input_data = input.data(); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -772,7 +772,7 @@ static void Interpolate2DCUDAFwd( DenseTensor* output) { auto* input_data = input.data(); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); @@ -1024,7 +1024,7 @@ static void Interpolate3DCUDAFwd( DenseTensor* output) { auto* input_data = input.data(); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; funcs::ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); diff --git a/paddle/phi/kernels/gpu/kthvalue_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_kernel.cu index 2ecec80c27b242..b79ee58d38975f 100644 --- a/paddle/phi/kernels/gpu/kthvalue_kernel.cu +++ b/paddle/phi/kernels/gpu/kthvalue_kernel.cu @@ -46,7 +46,7 @@ bool SortKthvalue(const phi::GPUContext& dev_ctx, auto cu_stream = dev_ctx.stream(); DenseTensor input_indices; const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); + auto dim = common::make_ddim(dims); input_indices.Resize(dim); dev_ctx.template Alloc(&input_indices); size_t temp_storage_bytes = -1; @@ -140,7 +140,7 @@ bool SortKthvalue(const phi::GPUContext& dev_ctx, auto e_tmp_indices = EigenMatrix::From(static_cast(temp_indices)); std::vector odims = {static_cast(num_rows), static_cast(1)}; - dim = phi::make_ddim(odims); + dim = common::make_ddim(odims); auto e_values = EigenMatrix::From(*out_tensor, dim); auto e_tmp_values = EigenMatrix::From(static_cast(temp_values)); @@ -182,7 +182,7 @@ void KthvalueKernel(const Context& dev_ctx, if (axis == in_dims.size() - 1) { const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t& input_width = in_dims[in_dims.size() - 1]; #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 9000 const T* input_data = x.data(); @@ -221,7 +221,7 @@ void KthvalueKernel(const Context& dev_ctx, for (int i = axis + 1; i < in_dims.size(); i++) { tmp_out_shape.emplace_back(in_dims[i]); } - DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + DDim tmp_out_dims = common::make_ddim(tmp_out_shape); output->Resize(tmp_out_dims); indices->Resize(tmp_out_dims); } @@ -243,8 +243,8 @@ void KthvalueKernel(const Context& dev_ctx, trans_out.Resize(trans_out_dims); int64_t* tran_indices_data = dev_ctx.template 
Alloc(&trans_ind); T* tran_output_data = dev_ctx.template Alloc(&trans_out); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const int64_t input_width = trans_dims[trans_dims.size() - 1]; #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 9000 diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index e8fc640cdd508e..1e10da8967c21a 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -45,7 +45,7 @@ void LayerNormGradKernel(const Context &dev_ctx, auto *d_y = &out_grad; const auto &x_dims = x.dims(); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); int64_t batch_size = static_cast(matrix_dim[0]); int64_t feature_size = static_cast(matrix_dim[1]); diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index eb85d9ac826d0a..d9757183b289c8 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -463,8 +463,8 @@ void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, U *variance, int begin_norm_axis, float eps) { - const auto x_dims = phi::make_ddim(input_shape); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + const auto x_dims = common::make_ddim(input_shape); + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); int64_t batch_size = static_cast(matrix_dim[0]); int64_t feature_size = static_cast(matrix_dim[1]); switch (phi::funcs::GetDesiredBlockDim(feature_size)) { @@ -534,7 +534,7 @@ void LayerNormKernel(const Context &dev_ctx, "Unsupported data type of Scale and Bias")); } - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis); int64_t batch_size = static_cast(matrix_dim[0]); int64_t feature_size = static_cast(matrix_dim[1]); auto stream = dev_ctx.stream(); diff --git a/paddle/phi/kernels/gpu/lerp_grad_kernel.cu b/paddle/phi/kernels/gpu/lerp_grad_kernel.cu index d18c769b5117d0..6f41545e4d8e31 100644 --- a/paddle/phi/kernels/gpu/lerp_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/lerp_grad_kernel.cu @@ -82,10 +82,10 @@ __global__ void LerpGradScalarKernelImpl(const T* weight, bool XYNeedReduce(const DenseTensor& x, const DenseTensor& y, const DenseTensor& out) { - auto x_dims = - x.dims().size() ? x.dims() : make_ddim(std::vector(1, 1)); - auto y_dims = - y.dims().size() ? y.dims() : make_ddim(std::vector(1, 1)); + auto x_dims = x.dims().size() ? x.dims() + : common::make_ddim(std::vector(1, 1)); + auto y_dims = y.dims().size() ? y.dims() + : common::make_ddim(std::vector(1, 1)); auto out_dims = out.dims(); if (out_dims.size() == 0) { @@ -242,7 +242,7 @@ void LerpGradKernel(const Context& ctx, x_grad_data, y_grad_data); - auto zero_dim = make_ddim(std::vector(1, 1)); + auto zero_dim = common::make_ddim(std::vector(1, 1)); if (x_grad) { std::vector reduce_axis_x = funcs::GetReduceDim(x_grad->dims().size() ? 
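Every hunk above makes the same mechanical substitution: the DDim helpers that used to be declared in paddle/phi/core/ddim.h (make_ddim, vectorize, slice_ddim, product, stride, flatten_to_2d) now come from paddle/common/ddim.h and are qualified with common:: at the call site. A minimal sketch of the new spelling, assuming a Paddle source tree where paddle/common/ddim.h resolves; the function below is illustrative only and is not part of this patch:

#include <cstdint>
#include <vector>

#include "paddle/common/ddim.h"

// Build a shape, flatten it to 2-D at axis 1, and recover the extents --
// the same pattern the layer_norm and lerp call sites above now use.
void DDimMigrationSketch() {
  common::DDim dims = common::make_ddim({8, 16, 32});    // was phi::make_ddim
  common::DDim matrix = common::flatten_to_2d(dims, 1);  // was phi::flatten_to_2d
  std::vector<int64_t> shape = common::vectorize(dims);  // was phi::vectorize
  int64_t rows = matrix[0];  // 8
  int64_t cols = matrix[1];  // 16 * 32 = 512
  (void)shape;
  (void)rows;
  (void)cols;
}

Only the namespace qualification changes; argument lists and return types are untouched, which is why nearly every hunk in this section is a one-line substitution.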
x_grad->dims() : zero_dim, diff --git a/paddle/phi/kernels/gpu/lerp_kernel.cu b/paddle/phi/kernels/gpu/lerp_kernel.cu index f9d8514a54ca22..9ecb6b14379e2d 100644 --- a/paddle/phi/kernels/gpu/lerp_kernel.cu +++ b/paddle/phi/kernels/gpu/lerp_kernel.cu @@ -86,20 +86,22 @@ void LerpKernel(const Context &ctx, if (x.dims().size() < y.dims().size() && x.dims().size() < weight.dims().size()) { // x broadcast to b_min - ExpandKernel(ctx, x, phi::vectorize(b_min.dims()), &b_min); + ExpandKernel( + ctx, x, common::vectorize(b_min.dims()), &b_min); inputs.emplace_back(&b_min); inputs.emplace_back(&y); inputs.emplace_back(&weight); } else if (y.dims().size() < weight.dims().size()) { // y broadcast to b_min - ExpandKernel(ctx, y, phi::vectorize(b_min.dims()), &b_min); + ExpandKernel( + ctx, y, common::vectorize(b_min.dims()), &b_min); inputs.emplace_back(&x); inputs.emplace_back(&b_min); inputs.emplace_back(&weight); } else { // weight broadcast to b_min ExpandKernel( - ctx, weight, phi::vectorize(b_min.dims()), &b_min); + ctx, weight, common::vectorize(b_min.dims()), &b_min); inputs.emplace_back(&x); inputs.emplace_back(&y); inputs.emplace_back(&b_min); diff --git a/paddle/phi/kernels/gpu/linspace_kernel.cu b/paddle/phi/kernels/gpu/linspace_kernel.cu index 68ff7c3b0a93de..3a54561c9a1444 100644 --- a/paddle/phi/kernels/gpu/linspace_kernel.cu +++ b/paddle/phi/kernels/gpu/linspace_kernel.cu @@ -88,7 +88,7 @@ void LinspaceKernel(const Context& ctx, "than 0, but received num is %d", num)); - out->Resize(phi::make_ddim({num})); + out->Resize(common::make_ddim({num})); T* out_data = ctx.template Alloc(out); auto stream = ctx.stream(); diff --git a/paddle/phi/kernels/gpu/logspace_kernel.cu b/paddle/phi/kernels/gpu/logspace_kernel.cu index 458600e2f29e14..250e440170d7d1 100644 --- a/paddle/phi/kernels/gpu/logspace_kernel.cu +++ b/paddle/phi/kernels/gpu/logspace_kernel.cu @@ -93,7 +93,7 @@ void LogspaceKernel(const Context& ctx, "than 0, but received num is %d", num)); - out->Resize(phi::make_ddim({num})); + out->Resize(common::make_ddim({num})); T* out_data = ctx.template Alloc(out); double step = 0; diff --git a/paddle/phi/kernels/gpu/logsumexp_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_kernel.cu index ef2c29bbb2da0d..d780f2258651f0 100644 --- a/paddle/phi/kernels/gpu/logsumexp_kernel.cu +++ b/paddle/phi/kernels/gpu/logsumexp_kernel.cu @@ -57,8 +57,8 @@ void LogsumexpFallbackKernel(const Context& dev_ctx, auto* in_x = &x; auto* out_y = out; - auto outdim = phi::make_ddim(outdim_vec); - auto keeped_outdim = phi::make_ddim(keeped_outdim_vec); + auto outdim = common::make_ddim(outdim_vec); + auto keeped_outdim = common::make_ddim(keeped_outdim_vec); out->Resize(outdim); dev_ctx.template Alloc(out_y); @@ -128,7 +128,7 @@ void LogsumexpKernel(const Context& dev_ctx, } } - auto outdim = phi::make_ddim(outdim_vec); + auto outdim = common::make_ddim(outdim_vec); if (compute_size <= 1024) { if (perm.size() != xdim.size()) perm.insert(perm.end(), axis_vec.begin(), axis_vec.end()); @@ -138,7 +138,7 @@ void LogsumexpKernel(const Context& dev_ctx, (axis_vec.size() == 1 && axis_vec[0] == xdim.size())) { transpose_x = x; } else { - transpose_x.Resize(make_ddim(transpose_shape)); + transpose_x.Resize(common::make_ddim(transpose_shape)); dev_ctx.template Alloc(&transpose_x); phi::funcs::TransposeGPUKernelDriver(dev_ctx, x, perm, &transpose_x); } diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index adb0ca09d89386..85db2de74e6fdd 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu 
+++ b/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -61,22 +61,22 @@ void LstsqKernel(const Context& dev_ctx, T rcond = rcond_scalar.to(); DenseTensor* new_x = new DenseTensor(); - new_x->Resize(phi::make_ddim({batch_count, m, n})); + new_x->Resize(common::make_ddim({batch_count, m, n})); dev_ctx.template Alloc(new_x); phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), true, new_x); DenseTensor* new_y = new DenseTensor(); - new_y->Resize(phi::make_ddim({batch_count, m, nrhs})); + new_y->Resize(common::make_ddim({batch_count, m, nrhs})); dev_ctx.template Alloc(new_y); phi::Copy(dev_ctx, y, dev_ctx.GetPlace(), true, new_y); // Prepare tau - auto tau_dims_vec = phi::vectorize(x_dims); + auto tau_dims_vec = common::vectorize(x_dims); tau_dims_vec.pop_back(); tau_dims_vec[tau_dims_vec.size() - 1] = min_mn; DenseTensor* tau = new DenseTensor(); - tau->Resize(phi::make_ddim(tau_dims_vec)); + tau->Resize(common::make_ddim(tau_dims_vec)); auto tau_data = dev_ctx.template Alloc(tau); if (m >= n) { @@ -108,7 +108,7 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor slice_r = phi::funcs::Slice(dev_ctx, trans_r, {-2}, {0}, {min_mn}); DenseTensor* res_r = new DenseTensor(); - res_r->Resize(phi::make_ddim({batch_count, min_mn, min_mn})); + res_r->Resize(common::make_ddim({batch_count, min_mn, min_mn})); dev_ctx.template Alloc(res_r); phi::TrilTriuKernel(dev_ctx, slice_r, 0, false, res_r); @@ -133,7 +133,7 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor slice_r = phi::funcs::Slice(dev_ctx, trans_r, {-2}, {0}, {min_mn}); DenseTensor* res_r = new DenseTensor(); - res_r->Resize(phi::make_ddim({batch_count, min_mn, min_mn})); + res_r->Resize(common::make_ddim({batch_count, min_mn, min_mn})); dev_ctx.template Alloc(res_r); phi::TrilTriuKernel(dev_ctx, slice_r, 0, false, res_r); @@ -161,7 +161,7 @@ void LstsqKernel(const Context& dev_ctx, dev_ctx, solu_tensor, dev_ctx.GetPlace(), true, solution); } - if (batch_count == 1) solution->Resize(phi::make_ddim({n, nrhs})); + if (batch_count == 1) solution->Resize(common::make_ddim({n, nrhs})); GetResidualsTensor(dev_ctx, x, y, solution, residuals); } diff --git a/paddle/phi/kernels/gpu/lu_kernel.cu b/paddle/phi/kernels/gpu/lu_kernel.cu index d5646d546b67d9..f509e0a173161b 100644 --- a/paddle/phi/kernels/gpu/lu_kernel.cu +++ b/paddle/phi/kernels/gpu/lu_kernel.cu @@ -138,16 +138,16 @@ void LUKernel(const Context& dev_ctx, int n = static_cast(outdims[outrank - 2]); int lda = std::max(1, m); if (pivot) { - auto ipiv_dims = phi::slice_ddim(outdims, 0, outrank - 1); + auto ipiv_dims = common::slice_ddim(outdims, 0, outrank - 1); ipiv_dims[outrank - 2] = std::min(m, n); pivots->Resize(ipiv_dims); } dev_ctx.template Alloc(pivots); auto ipiv_data = pivots->data(); - auto info_dims = phi::slice_ddim(outdims, 0, outrank - 2); + auto info_dims = common::slice_ddim(outdims, 0, outrank - 2); if (info_dims.size() == 0) { - info_dims = phi::make_ddim({1}); + info_dims = common::make_ddim({1}); } infos->Resize(info_dims); dev_ctx.template Alloc(infos); diff --git a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu index 4feadcf899a443..4bf5949f084fe5 100644 --- a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu @@ -65,8 +65,8 @@ void MaskedSelectGradKernel(const Context& dev_ctx, bool expand_x = false; auto expanded_size = funcs::MatrixGetBroadcastBatchPortion( - vectorize(x_grad->dims()), vectorize(mask.dims())); - auto expaned_dims = make_ddim(expanded_size); + 
common::vectorize(x_grad->dims()), common::vectorize(mask.dims())); + auto expaned_dims = common::make_ddim(expanded_size); if (mask.dims() != expaned_dims) { ExpandKernel( diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu index cd92d7f03e7dfd..9739f9799a4ec1 100644 --- a/paddle/phi/kernels/gpu/masked_select_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu @@ -54,9 +54,9 @@ void MaskedSelectKernel(const Context& dev_ctx, DenseTensor x_expand; auto expanded_size = funcs::MatrixGetBroadcastBatchPortion( - vectorize(x.dims()), vectorize(mask.dims())); + common::vectorize(x.dims()), common::vectorize(mask.dims())); - DDim epxand_dims = make_ddim(expanded_size); + DDim epxand_dims = common::make_ddim(expanded_size); if (mask.dims() != epxand_dims) { phi::ExpandKernel( dev_ctx, mask, IntArray(expanded_size), &mask_expand); diff --git a/paddle/phi/kernels/gpu/mode_kernel.cu b/paddle/phi/kernels/gpu/mode_kernel.cu index ed598b2e75d849..793928177000e6 100644 --- a/paddle/phi/kernels/gpu/mode_kernel.cu +++ b/paddle/phi/kernels/gpu/mode_kernel.cu @@ -56,7 +56,7 @@ void ModeKernel(const Context& dev_ctx, if (axis == in_dims.size() - 1) { const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t& input_width = in_dims[in_dims.size() - 1]; funcs::GetModebySort( dev_ctx, &x, input_width, input_height, output_data, indices_data); @@ -80,7 +80,7 @@ void ModeKernel(const Context& dev_ctx, for (int i = axis + 1; i < in_dims.size(); i++) { tmp_out_shape.emplace_back(in_dims[i]); } - DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + DDim tmp_out_dim = common::make_ddim(tmp_out_shape); out->Resize(tmp_out_dim); indices->Resize(tmp_out_dim); } @@ -109,8 +109,8 @@ void ModeKernel(const Context& dev_ctx, trans_out.Resize(trans_out_shape); T* trans_out_data = dev_ctx.template Alloc(&trans_out); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); const int64_t input_width = trans_shape[trans_shape.size() - 1]; funcs::GetModebySort(dev_ctx, &trans_input, diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index 96fc3d1ac2b2e5..635e9189b7d89a 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -22,10 +22,10 @@ limitations under the License. 
*/ namespace cub = hipcub; #endif +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/scalar.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/arg_min_max_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -193,7 +193,8 @@ void MultinomialKernel(const Context& dev_ctx, ArgMaxKernel( dev_ctx, rand, -1, true, false, DataType::INT64, out); } else { - std::vector out_dim_vec = vectorize(out->dims()); + std::vector out_dim_vec = + common::vectorize(out->dims()); DenseTensor value = Empty(dev_ctx, IntArray(out_dim_vec)); TopkKernel( dev_ctx, rand, num_samples, -1, true, true, &value, out); diff --git a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu index ccd1b5561f081d..01144442f3904b 100644 --- a/paddle/phi/kernels/gpu/nanmedian_kernel.cu +++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu @@ -166,10 +166,10 @@ void ProcessMedianKernel(const Context& dev_ctx, bool ignore_nan = true; if (ignore_nan) { - nan_counts.Resize(phi::make_ddim({pre_dim})); + nan_counts.Resize(common::make_ddim({pre_dim})); dev_ctx.template Alloc(&nan_counts); nan_counts_ptr = nan_counts.data(); - nan_stat.Resize(phi::make_ddim({2})); + nan_stat.Resize(common::make_ddim({2})); int64_t* nan_stat_mem = dev_ctx.template Alloc(&nan_stat); int64_t* nan_stat_ptr = nan_stat.data(); diff --git a/paddle/phi/kernels/gpu/nms_kernel.cu b/paddle/phi/kernels/gpu/nms_kernel.cu index 8eab5c261f5863..3de36f7bd68b38 100644 --- a/paddle/phi/kernels/gpu/nms_kernel.cu +++ b/paddle/phi/kernels/gpu/nms_kernel.cu @@ -103,7 +103,7 @@ void NMSKernel(const Context& dev_ctx, } } } - output->Resize(phi::make_ddim({last_box_num})); + output->Resize(common::make_ddim({last_box_num})); auto* output_data = dev_ctx.template Alloc(output); memory_utils::Copy(dev_ctx.GetPlace(), output_data, diff --git a/paddle/phi/kernels/gpu/nonzero_kernel.cu b/paddle/phi/kernels/gpu/nonzero_kernel.cu index bc44f4f033c458..65cdcd3d6a058d 100644 --- a/paddle/phi/kernels/gpu/nonzero_kernel.cu +++ b/paddle/phi/kernels/gpu/nonzero_kernel.cu @@ -20,7 +20,7 @@ namespace cub = hipcub; #endif -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/gpu/number_count_kernel.cu b/paddle/phi/kernels/gpu/number_count_kernel.cu index e17727751f4bf1..6fdfb71724aef4 100644 --- a/paddle/phi/kernels/gpu/number_count_kernel.cu +++ b/paddle/phi/kernels/gpu/number_count_kernel.cu @@ -77,7 +77,7 @@ void NumberCountKernel(const Context& ctx, DenseTensor* out) { int64_t batch_size = numbers.numel(); - DDim out_dims = phi::make_ddim({upper_range}); + DDim out_dims = common::make_ddim({upper_range}); out->Resize(out_dims); auto out_data = ctx.template Alloc(out); const T* gate_data = numbers.data(); diff --git a/paddle/phi/kernels/gpu/overlap_add_grad_kernel.cu b/paddle/phi/kernels/gpu/overlap_add_grad_kernel.cu index a2ec60109d6404..337620a556db5e 100644 --- a/paddle/phi/kernels/gpu/overlap_add_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/overlap_add_grad_kernel.cu @@ -50,15 +50,16 @@ void OverlapAddGradKernel(const Context& dev_ctx, phi::DDim x_grad_resized_dims; phi::DDim out_grad_resized_dims; if (axis == 0) { - preserved_dims = phi::slice_ddim(out_grad_.dims(), 1, out_grad_rank); + preserved_dims = 
common::slice_ddim(out_grad_.dims(), 1, out_grad_rank); x_grad_resized_dims = { - n_frames, frame_length, phi::product(preserved_dims)}; - out_grad_resized_dims = {seq_length, phi::product(preserved_dims)}; + n_frames, frame_length, common::product(preserved_dims)}; + out_grad_resized_dims = {seq_length, common::product(preserved_dims)}; } else { - preserved_dims = phi::slice_ddim(out_grad_.dims(), 0, out_grad_rank - 1); + preserved_dims = + common::slice_ddim(out_grad_.dims(), 0, out_grad_rank - 1); x_grad_resized_dims = { - phi::product(preserved_dims), frame_length, n_frames}; - out_grad_resized_dims = {phi::product(preserved_dims), seq_length}; + common::product(preserved_dims), frame_length, n_frames}; + out_grad_resized_dims = {common::product(preserved_dims), seq_length}; } x_grad->Resize(x_grad_resized_dims); out_grad_.Resize(out_grad_resized_dims); @@ -73,31 +74,31 @@ void OverlapAddGradKernel(const Context& dev_ctx, trans_out_grad = out_grad_; std::vector perm_x_grad{1, 0}; - auto x_grad_dims_vec = phi::vectorize(x_grad->dims()); + auto x_grad_dims_vec = common::vectorize(x_grad->dims()); for (int i = 0; i < x_grad->dims().size(); ++i) { x_grad_dims_vec[i] = x_grad->dims()[perm_x_grad[i]]; } - trans_x_grad.Resize(phi::make_ddim(x_grad_dims_vec)); + trans_x_grad.Resize(common::make_ddim(x_grad_dims_vec)); dev_ctx.template Alloc(&trans_x_grad); phi::funcs::TransCompute( perm_x_grad.size(), dev_ctx, *x_grad, &trans_x_grad, perm_x_grad); } else { std::vector perm_d_out{1, 0}; - auto out_grad_dims_vec = phi::vectorize(out_grad_.dims()); + auto out_grad_dims_vec = common::vectorize(out_grad_.dims()); for (int i = 0; i < out_grad_.dims().size(); ++i) { out_grad_dims_vec[i] = out_grad_.dims()[perm_d_out[i]]; } - trans_out_grad.Resize(phi::make_ddim(out_grad_dims_vec)); + trans_out_grad.Resize(common::make_ddim(out_grad_dims_vec)); dev_ctx.template Alloc(&trans_out_grad); phi::funcs::TransCompute( perm_d_out.size(), dev_ctx, out_grad_, &trans_out_grad, perm_d_out); std::vector perm_x_grad{2, 1, 0}; - auto x_grad_dims_vec = phi::vectorize(x_grad->dims()); + auto x_grad_dims_vec = common::vectorize(x_grad->dims()); for (int i = 0; i < x_grad->dims().size(); ++i) { x_grad_dims_vec[i] = x_grad->dims()[perm_x_grad[i]]; } - trans_x_grad.Resize(phi::make_ddim(x_grad_dims_vec)); + trans_x_grad.Resize(common::make_ddim(x_grad_dims_vec)); dev_ctx.template Alloc(&trans_x_grad); phi::funcs::TransCompute( perm_x_grad.size(), dev_ctx, *x_grad, &trans_x_grad, perm_x_grad); @@ -146,7 +147,7 @@ void OverlapAddGradKernel(const Context& dev_ctx, restored_x_grad_shape.push_back(n_frames); } - x_grad->Resize(phi::make_ddim(restored_x_grad_shape)); + x_grad->Resize(common::make_ddim(restored_x_grad_shape)); } } diff --git a/paddle/phi/kernels/gpu/overlap_add_kernel.cu b/paddle/phi/kernels/gpu/overlap_add_kernel.cu index b8726b8d8e15ad..71668e9e10b43a 100644 --- a/paddle/phi/kernels/gpu/overlap_add_kernel.cu +++ b/paddle/phi/kernels/gpu/overlap_add_kernel.cu @@ -46,13 +46,15 @@ void OverlapAddKernel(const Context& dev_ctx, phi::DDim x_resized_dims; phi::DDim out_resized_dims; if (axis == 0) { - preserved_dims = phi::slice_ddim(out->dims(), 1, out_rank); - x_resized_dims = {n_frames, frame_length, phi::product(preserved_dims)}; - out_resized_dims = {seq_length, phi::product(preserved_dims)}; + preserved_dims = common::slice_ddim(out->dims(), 1, out_rank); + x_resized_dims = { + n_frames, frame_length, common::product(preserved_dims)}; + out_resized_dims = {seq_length, common::product(preserved_dims)}; } else { - 
preserved_dims = phi::slice_ddim(out->dims(), 0, out_rank - 1); - x_resized_dims = {phi::product(preserved_dims), frame_length, n_frames}; - out_resized_dims = {phi::product(preserved_dims), seq_length}; + preserved_dims = common::slice_ddim(out->dims(), 0, out_rank - 1); + x_resized_dims = { + common::product(preserved_dims), frame_length, n_frames}; + out_resized_dims = {common::product(preserved_dims), seq_length}; } x_.Resize(x_resized_dims); out->Resize(out_resized_dims); @@ -67,31 +69,31 @@ void OverlapAddKernel(const Context& dev_ctx, trans_out = *out; std::vector perm_x{1, 0}; - auto x_dims_vec = phi::vectorize(x_.dims()); + auto x_dims_vec = common::vectorize(x_.dims()); for (int i = 0; i < x_.dims().size(); ++i) { x_dims_vec[i] = x_.dims()[perm_x[i]]; } - trans_x.Resize(phi::make_ddim(x_dims_vec)); + trans_x.Resize(common::make_ddim(x_dims_vec)); dev_ctx.template Alloc(&trans_x); phi::funcs::TransCompute( perm_x.size(), dev_ctx, x_, &trans_x, perm_x); } else { std::vector perm_out{1, 0}; - auto out_dims_vec = phi::vectorize(out->dims()); + auto out_dims_vec = common::vectorize(out->dims()); for (int i = 0; i < out->dims().size(); ++i) { out_dims_vec[i] = out->dims()[perm_out[i]]; } - trans_out.Resize(phi::make_ddim(out_dims_vec)); + trans_out.Resize(common::make_ddim(out_dims_vec)); dev_ctx.template Alloc(&trans_out); phi::funcs::TransCompute( perm_out.size(), dev_ctx, *out, &trans_out, perm_out); std::vector perm_x{2, 1, 0}; - auto x_dims_vec = phi::vectorize(x_.dims()); + auto x_dims_vec = common::vectorize(x_.dims()); for (int i = 0; i < x_.dims().size(); ++i) { x_dims_vec[i] = x_.dims()[perm_x[i]]; } - trans_x.Resize(phi::make_ddim(x_dims_vec)); + trans_x.Resize(common::make_ddim(x_dims_vec)); dev_ctx.template Alloc(&trans_x); phi::funcs::TransCompute( perm_x.size(), dev_ctx, x_, &trans_x, perm_x); @@ -132,7 +134,7 @@ void OverlapAddKernel(const Context& dev_ctx, restored_out_shape.push_back(seq_length); } - out->Resize(phi::make_ddim(restored_out_shape)); + out->Resize(common::make_ddim(restored_out_shape)); } } diff --git a/paddle/phi/kernels/gpu/p_recv_kernel.cu b/paddle/phi/kernels/gpu/p_recv_kernel.cu index 1e413797b6b893..b6fd090173260f 100644 --- a/paddle/phi/kernels/gpu/p_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/p_recv_kernel.cu @@ -16,9 +16,9 @@ #include "glog/logging.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #if defined(PADDLE_WITH_NCCL) || \ @@ -168,7 +168,7 @@ void PRecvArrayKernel(const Context& dev_ctx, dev_ctx.Alloc(&out, dtype); comm_ctx->Recv(&out, out.numel(), peer, stream); VLOG(3) << "rank " << comm_ctx->GetRank() << " recv " - << phi::product(out_dims) << " from " << peer; + << common::product(out_dims) << " from " << peer; } #else PADDLE_THROW( diff --git a/paddle/phi/kernels/gpu/p_send_kernel.cu b/paddle/phi/kernels/gpu/p_send_kernel.cu index 520adcf730a1d6..efbb69afcdab75 100644 --- a/paddle/phi/kernels/gpu/p_send_kernel.cu +++ b/paddle/phi/kernels/gpu/p_send_kernel.cu @@ -156,7 +156,7 @@ void PSendArrayKernel(const Context& dev_ctx, ncclDataType_t dtype = ToNCCLDataType(x.type()); comm_ctx->Send(x, x.numel(), peer, stream); VLOG(3) << "rank " << comm_ctx->GetRank() << " send " - << phi::product(x.dims()) << " to " << peer; + << common::product(x.dims()) << " to " << peer; } #else PADDLE_THROW( diff --git a/paddle/phi/kernels/gpu/qr_kernel.cu b/paddle/phi/kernels/gpu/qr_kernel.cu index 
14f602cc95bd62..5bbb2ef158aa1a 100644 --- a/paddle/phi/kernels/gpu/qr_kernel.cu +++ b/paddle/phi/kernels/gpu/qr_kernel.cu @@ -40,7 +40,7 @@ static DenseTensor Fill(const Context& ctx, std::vector shape, float fill_value) { DenseTensor ret; - ret.Resize(make_ddim(shape)); + ret.Resize(common::make_ddim(shape)); ctx.template Alloc(&ret); funcs::SetConstant()(ctx, &ret, T(fill_value)); return ret; @@ -85,7 +85,7 @@ void QrKernel(const Context& ctx, phi::Copy(ctx, x, ctx.GetPlace(), false, &qr); // Prepare tau - auto tau_dims_vec = phi::vectorize(x_dims); + auto tau_dims_vec = common::vectorize(x_dims); tau_dims_vec.pop_back(); tau_dims_vec[tau_dims_vec.size() - 1] = min_mn; DenseTensor tau = Fill(ctx, tau_dims_vec, 0); @@ -133,7 +133,7 @@ void QrKernel(const Context& ctx, phi::Copy(ctx, sliced_q, q->place(), false, q); } else { if (m > n) { - auto new_qr_dims_vec = phi::vectorize(x_dims); + auto new_qr_dims_vec = common::vectorize(x_dims); new_qr_dims_vec[new_qr_dims_vec.size() - 1] = m; DenseTensor new_qr = Fill(ctx, new_qr_dims_vec, 0); auto new_qr_data = ctx.template Alloc>(&new_qr); @@ -195,11 +195,11 @@ void BatchedGeqrf(const GPUContext& dev_ctx, phi::dynload::cusolverDnSgeqrf_bufferSize(handle, m, n, a, lda, &lwork)); DenseTensor workspace = DenseTensor(); - workspace.Resize(make_ddim({lwork})); + workspace.Resize(common::make_ddim({lwork})); float* workspace_ptr = dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); - info.Resize(make_ddim({1})); + info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { @@ -249,11 +249,11 @@ void BatchedGeqrf(const GPUContext& dev_ctx, phi::dynload::cusolverDnDgeqrf_bufferSize(handle, m, n, a, lda, &lwork)); DenseTensor workspace = DenseTensor(); - workspace.Resize(make_ddim({lwork})); + workspace.Resize(common::make_ddim({lwork})); double* workspace_ptr = dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); - info.Resize(make_ddim({1})); + info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { @@ -304,11 +304,11 @@ void BatchedOrgqr(const GPUContext& dev_ctx, handle, m, n, k, a, lda, tau, &lwork)); DenseTensor workspace = DenseTensor(); - workspace.Resize(make_ddim({lwork})); + workspace.Resize(common::make_ddim({lwork})); float* workspace_ptr = dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); - info.Resize(make_ddim({1})); + info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { @@ -360,11 +360,11 @@ void BatchedOrgqr(const GPUContext& dev_ctx, handle, m, n, k, a, lda, tau, &lwork)); DenseTensor workspace = DenseTensor(); - workspace.Resize(make_ddim({lwork})); + workspace.Resize(common::make_ddim({lwork})); double* workspace_ptr = dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); - info.Resize(make_ddim({1})); + info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu index 39a57a5a6e8653..22c32b883bf08a 100644 --- a/paddle/phi/kernels/gpu/randint_kernel.cu +++ b/paddle/phi/kernels/gpu/randint_kernel.cu @@ -31,7 +31,7 @@ void RandintKernel(const Context& dev_ctx, DataType dtype, DenseTensor* out) { int seed = 0; - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); T* data = 
dev_ctx.template Alloc(out); funcs::uniform_distribution dist; funcs::uniform_int_transform trans(low, high); diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index 4c6597b93f91fd..f439336cc1e709 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -107,7 +107,7 @@ void RandpermKernel(const Context& dev_ctx, range_data[idx] = static_cast(idx); }); - out->Resize(phi::make_ddim({n})); + out->Resize(common::make_ddim({n})); T* out_data = dev_ctx.template Alloc(out); // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to diff --git a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h index b04267030b2846..02dfa4348f5c0d 100644 --- a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h +++ b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h @@ -42,7 +42,7 @@ void ReduceCudaAMaxAMinGrad(const Context& dev_ctx, // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = in_x->dims().size(); auto reduce_dims = funcs::details::GetReduceDim(dims, dim_size, reduce_all); - auto update_dims = vectorize(d_x->dims()); + auto update_dims = common::vectorize(d_x->dims()); int reduce_num = 1; for (auto i : reduce_dims) { reduce_num *= (in_x->dims())[i]; @@ -52,12 +52,12 @@ void ReduceCudaAMaxAMinGrad(const Context& dev_ctx, // make new tensor reduce_out phi::DenseTensor new_y(out_y->type()); new_y.ShareDataWith(*out_y); - new_y.Resize(phi::make_ddim(update_dims)); + new_y.Resize(common::make_ddim(update_dims)); // make new tensor d_out phi::DenseTensor new_dout(d_out->type()); new_dout.ShareDataWith(*d_out); - new_dout.Resize(phi::make_ddim(update_dims)); + new_dout.Resize(common::make_ddim(update_dims)); dev_ctx.Alloc(d_x, d_out->dtype()); auto new_in = std::make_unique(*in_x); @@ -74,7 +74,7 @@ void ReduceCudaAMaxAMinGrad(const Context& dev_ctx, // make new tensor equal_count phi::DenseTensor* equal_count = new phi::DenseTensor(); - equal_count->Resize(phi::make_ddim(update_dims)); + equal_count->Resize(common::make_ddim(update_dims)); dev_ctx.template Alloc(equal_count); // compute diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index 7e01c1ae843910..0a01fe1ff1aab4 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -61,7 +61,7 @@ void ReduceGradKernel(const Context& dev_ctx, std::vector reduce_dims = funcs::details::GetReduceDim(dims, dim_size, reduce_all); - auto update_dims = vectorize(d_x->dims()); + auto update_dims = common::vectorize(d_x->dims()); int reduce_num = 1; for (auto i : reduce_dims) { reduce_num *= (in_x->dims())[i]; @@ -70,7 +70,7 @@ void ReduceGradKernel(const Context& dev_ctx, // make new tensor DenseTensor new_d_out(d_out->dtype()); new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(phi::make_ddim(update_dims)); + new_d_out.Resize(common::make_ddim(update_dims)); dev_ctx.Alloc(d_x, x.dtype()); diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu index d9714d37febd9b..51b50ed6e00248 100644 --- a/paddle/phi/kernels/gpu/reduce_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_kernel.cu @@ -52,7 +52,7 @@ void ReduceSumGradKernel(const Context& dev_ctx, std::vector reduce_dims = funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); - auto update_dims = vectorize(x.dims()); + auto update_dims = common::vectorize(x.dims()); for (auto i : reduce_dims) { update_dims[i] = 1; } @@ -60,7 +60,7 @@ 
void ReduceSumGradKernel(const Context& dev_ctx, // make new tensor DenseTensor new_out_grad(out_grad.dtype()); new_out_grad.ShareDataWith(out_grad); - new_out_grad.Resize(phi::make_ddim(update_dims)); + new_out_grad.Resize(common::make_ddim(update_dims)); // call ReduceGrad dev_ctx.Alloc(x_grad, x.dtype()); @@ -89,7 +89,7 @@ void ReduceMinGradKernel(const Context& dev_ctx, int dim_size = x.dims().size(); auto reduce_dims = funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); - auto update_dims = vectorize(x.dims()); + auto update_dims = common::vectorize(x.dims()); for (auto i : reduce_dims) { update_dims[i] = 1; } @@ -97,11 +97,11 @@ void ReduceMinGradKernel(const Context& dev_ctx, // make new tensor of out and out_grad phi::DenseTensor new_out(out.type()); new_out.ShareDataWith(out); - new_out.Resize(phi::make_ddim(update_dims)); + new_out.Resize(common::make_ddim(update_dims)); phi::DenseTensor new_out_grad(out_grad.type()); new_out_grad.ShareDataWith(out_grad); - new_out_grad.Resize(phi::make_ddim(update_dims)); + new_out_grad.Resize(common::make_ddim(update_dims)); // make equal_out phi::DenseTensor* equal_out = new phi::DenseTensor(); @@ -134,7 +134,7 @@ void ReduceMeanGradKernel(const Context& dev_ctx, std::vector reduce_dims = funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); - auto update_dims = vectorize(x.dims()); + auto update_dims = common::vectorize(x.dims()); int reduce_num = 1; for (auto i : reduce_dims) { reduce_num *= (x.dims())[i]; @@ -144,7 +144,7 @@ void ReduceMeanGradKernel(const Context& dev_ctx, // make new tensor DenseTensor new_out_grad(out_grad.dtype()); new_out_grad.ShareDataWith(out_grad); - new_out_grad.Resize(phi::make_ddim(update_dims)); + new_out_grad.Resize(common::make_ddim(update_dims)); // call BroadcastKernel dev_ctx.Alloc(x_grad, x.dtype()); @@ -172,7 +172,7 @@ void ReduceMaxGradKernel(const Context& dev_ctx, int dim_size = x.dims().size(); auto reduce_dims = funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); - auto update_dims = vectorize(x.dims()); + auto update_dims = common::vectorize(x.dims()); for (auto i : reduce_dims) { update_dims[i] = 1; } @@ -180,11 +180,11 @@ void ReduceMaxGradKernel(const Context& dev_ctx, // make new tensor of out and out_grad phi::DenseTensor new_out(out.type()); new_out.ShareDataWith(out); - new_out.Resize(phi::make_ddim(update_dims)); + new_out.Resize(common::make_ddim(update_dims)); phi::DenseTensor new_out_grad(out_grad.type()); new_out_grad.ShareDataWith(out_grad); - new_out_grad.Resize(phi::make_ddim(update_dims)); + new_out_grad.Resize(common::make_ddim(update_dims)); // make equal_out phi::DenseTensor* equal_out = new phi::DenseTensor(); diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu index f7a53636fcbf65..75bdf5d4664529 100644 --- a/paddle/phi/kernels/gpu/roi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -111,7 +111,7 @@ void RoiPoolKernel(const Context& dev_ctx, DenseTensor* arg_max) { auto x_dims = x.dims(); int batch_size = x_dims[0]; - auto in_stride = phi::stride(x_dims); + auto in_stride = common::stride(x_dims); int channels = x_dims[1]; int height = x_dims[2]; int width = x_dims[3]; diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu index 71d1cd356a2692..7239868e78159e 100644 --- a/paddle/phi/kernels/gpu/roll_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -37,7 +37,7 @@ void RollGradKernel(const Context& dev_ctx, 
int64_t numel = out_grad.numel(); auto input_dim = out_grad.dims(); - auto stride_dim = phi::stride(input_dim); + auto stride_dim = common::stride(input_dim); std::vector strides(rank), sizes(rank); if (axis.size() == 0) { diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu index cf4f87ac118546..718abfe46994b2 100644 --- a/paddle/phi/kernels/gpu/roll_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -14,11 +14,11 @@ #include "paddle/phi/kernels/roll_kernel.h" +#include "paddle/common/array.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/array.h" #include "paddle/phi/kernels/gpu/roll_kernel_impl.h" namespace phi { @@ -37,7 +37,7 @@ void RollKernel(const Context& dev_ctx, int64_t numel = x.numel(); auto input_dim = x.dims(); - auto stride_dim = phi::stride(input_dim); + auto stride_dim = common::stride(input_dim); std::vector strides(rank), sizes(rank); if (axis.size() == 0) { diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h index c7ffcb2d5ca522..7689f5242a1223 100644 --- a/paddle/phi/kernels/gpu/roll_kernel_impl.h +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/array.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/core/utils/array.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/send_u_recv_kernel.cu b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu index 5e34c490d82999..192257ed8f95c0 100644 --- a/paddle/phi/kernels/gpu/send_u_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_u_recv_kernel.cu @@ -47,11 +47,11 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, } } else { // Set out dim following out_size. 
- std::vector dims_ = phi::vectorize(out->dims()); + std::vector dims_ = common::vectorize(out->dims()); if (dims_.size() > 0) { dims_[0] = out_size; } - out->Resize(phi::make_ddim(dims_)); + out->Resize(common::make_ddim(dims_)); memset_size = out_size; for (int i = 1; i < src_dims.size(); ++i) { memset_size *= src_dims[i]; diff --git a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu index 7274b391e8d135..07c81d86f61014 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu @@ -44,13 +44,13 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, const int& index_size = src_index.dims()[0]; auto out_dims = out->dims(); int64_t memset_size = 1; - std::vector dims_ = phi::vectorize(out_dims); + std::vector dims_ = common::vectorize(out_dims); if (out_size <= 0) { dims_[0] = x.dims()[0]; } else { dims_[0] = out_size; } - out->Resize(phi::make_ddim(dims_)); + out->Resize(common::make_ddim(dims_)); for (size_t i = 0; i < dims_.size(); i++) { memset_size *= dims_[i]; } diff --git a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu index c50b1960d00563..f5aea524031d24 100644 --- a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu @@ -81,7 +81,7 @@ void CalculateGrad(const Context& ctx, out_grad, d_index, s_index, index_size, slice_size, x_grad); } else { const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, x_grad_dims); - auto out_grad_dims_1 = phi::vectorize(out_grad_dims); + auto out_grad_dims_1 = common::vectorize(out_grad_dims); std::vector out_grad_dims_2(out_grad_dims_1.begin() + 1, out_grad_dims_1.end()); out_grad_dims_2.insert(out_grad_dims_2.begin(), x_grad_dims[0]); @@ -160,7 +160,7 @@ void CalculateGrad(const Context& ctx, mul_functor, sum_functor); } else { - auto out_grad_dims_1 = phi::vectorize(out_grad_dims); + auto out_grad_dims_1 = common::vectorize(out_grad_dims); std::vector out_grad_dims_2(out_grad_dims_1.begin() + 1, out_grad_dims_1.end()); out_grad_dims_2.insert(out_grad_dims_2.begin(), x_grad_dims[0]); diff --git a/paddle/phi/kernels/gpu/shuffle_batch_grad_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_grad_kernel.cu index 33b39666edf071..9472861a64c8e3 100644 --- a/paddle/phi/kernels/gpu/shuffle_batch_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/shuffle_batch_grad_kernel.cu @@ -21,9 +21,9 @@ #include #endif +#include "paddle/common/errors.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu index e145e7e1c8a206..e1bc107e214f78 100644 --- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu @@ -21,9 +21,9 @@ #include #endif +#include "paddle/common/errors.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/for_range.h" @@ -49,7 +49,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, for (int i = 0; i < x.dims().size() - 1; i++) { elem_size *= x.dims()[i]; } - shuffleidx->Resize(phi::make_ddim({elem_size})); + 
shuffleidx->Resize(common::make_ddim({elem_size})); int64_t seed_int = 0; if (seed.initialized()) { @@ -92,7 +92,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, phi::funcs::ForRange for_range(dev_ctx, elem_size * x_embed_size); for_range(functor); - seed_out->Resize(phi::make_ddim({1})); + seed_out->Resize(common::make_ddim({1})); auto* seed_out_data = dev_ctx.template HostAlloc(seed_out); *seed_out_data = engine(); #endif diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu index a6e627a5fb4bf2..ffc3055c27acde 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -124,7 +124,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel( DenseTensor *norm_tensor = new DenseTensor(); norm_tensor->Resize({sizeof(T)}); dev_ctx.template Alloc(norm_tensor); - auto dims = phi::vectorize(counts_tensor->dims()); + auto dims = common::vectorize(counts_tensor->dims()); std::vector reduce_dim = {}; for (int i = 0; i < dims.size(); i++) { reduce_dim.push_back(i); diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu index 966c85506a128a..f94c09922980b7 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -126,7 +126,7 @@ void SigmoidCrossEntropyWithLogitsKernel( DenseTensor *norm_tensor = new DenseTensor(); norm_tensor->Resize({sizeof(T)}); dev_ctx.template Alloc(norm_tensor); - auto dims = phi::vectorize(counts_tensor->dims()); + auto dims = common::vectorize(counts_tensor->dims()); std::vector reduce_dim = {}; for (int i = 0; i < dims.size(); i++) { reduce_dim.push_back(i); diff --git a/paddle/phi/kernels/gpu/strided_copy_kernel.cu b/paddle/phi/kernels/gpu/strided_copy_kernel.cu index fc452eb44973dd..ae173b5f03528f 100644 --- a/paddle/phi/kernels/gpu/strided_copy_kernel.cu +++ b/paddle/phi/kernels/gpu/strided_copy_kernel.cu @@ -779,9 +779,9 @@ bool LaunchStrided2ContiguousCazeOneKernel( template __global__ void Strided2ContiguousDefaultFunc( const T* input_data, - phi::Array input_stride, + Array input_stride, T* output_data, - phi::Array dims, + Array dims, const int64_t numel) { int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; #pragma unroll @@ -1185,8 +1185,8 @@ template __global__ void Contiguous2StridedDefaultFunc( const T* input_data, T* output_data, - phi::Array output_stride, - phi::Array dims, + Array output_stride, + Array dims, const int64_t numel) { int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; #pragma unroll @@ -1265,8 +1265,8 @@ void StridedCopyKernel(const Context& dev_ctx, int64_t offset, DenseTensor* out) { phi::DenseTensorMeta meta = input.meta(); - meta.strides = phi::make_ddim(out_stride); - meta.dims = phi::make_ddim(dims); + meta.strides = common::make_ddim(out_stride); + meta.dims = common::make_ddim(dims); meta.offset = offset; out->set_meta(meta); @@ -1286,8 +1286,8 @@ void StridedCopyKernel(const Context& dev_ctx, const T* input_data = input.data(); int rank = input.dims().size(); - phi::Array input_dims; - phi::Array input_stride; + Array input_dims; + Array input_stride; for (int i = 0; i < input.dims().size(); i++) { input_dims[i] = input.dims()[i]; input_stride[i] = input.strides()[i]; @@ -1299,7 +1299,7 @@ void StridedCopyKernel(const Context& dev_ctx, 
"StridedCopyKernel's out tensor must complete " "mutable data before call kernel.")); - phi::Array output_stride; + Array output_stride; for (int i = 0; i < meta.dims.size(); i++) { output_stride[i] = meta.strides[i]; } diff --git a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu index b50fad637d106e..8d3a8ee2114bd9 100644 --- a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/temporal_shift_grad_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -98,7 +98,7 @@ void TemporalShiftGradKernel(const Context& dev_ctx, auto* input_grad = x_grad; auto* output_grad = &out_grad; int t = seg_num; - const DataLayout data_layout = phi::StringToDataLayout(data_format_str); + const DataLayout data_layout = common::StringToDataLayout(data_format_str); const int nt = output_grad->dims()[0]; const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1] @@ -117,8 +117,8 @@ void TemporalShiftGradKernel(const Context& dev_ctx, const int c2 = static_cast(c * 2 * shift_ratio); DDim in_grad_dims = - (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w}) - : phi::make_ddim({nt, h, w, c})); + (data_layout == DataLayout::kNCHW ? common::make_ddim({nt, c, h, w}) + : common::make_ddim({nt, h, w, c})); const T* output_grad_data = output_grad->data(); input_grad->Resize(in_grad_dims); T* input_grad_data = dev_ctx.template Alloc(input_grad); diff --git a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu index 4904da296488f3..5867fa98e01641 100644 --- a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu +++ b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/temporal_shift_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -98,7 +98,7 @@ void TemporalShiftKernel(const Context& dev_ctx, auto* input = &x; auto* output = out; int t = seg_num; - const DataLayout data_layout = phi::StringToDataLayout(data_format_str); + const DataLayout data_layout = common::StringToDataLayout(data_format_str); const int nt = input->dims()[0]; const int c = @@ -117,8 +117,8 @@ void TemporalShiftKernel(const Context& dev_ctx, const int c2 = static_cast(c * 2 * shift_ratio); DDim out_dims = - (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w}) - : phi::make_ddim({nt, h, w, c})); + (data_layout == DataLayout::kNCHW ? 
common::make_ddim({nt, c, h, w}) + : common::make_ddim({nt, h, w, c})); const T* input_data = input->data(); output->Resize(out_dims); T* output_data = dev_ctx.template Alloc(output); diff --git a/paddle/phi/kernels/gpu/tile_kernel.cu b/paddle/phi/kernels/gpu/tile_kernel.cu index 7861a2bdf01f87..06b07437cf660e 100644 --- a/paddle/phi/kernels/gpu/tile_kernel.cu +++ b/paddle/phi/kernels/gpu/tile_kernel.cu @@ -45,7 +45,7 @@ void TileKernel(const Context& dev_ctx, repeat_times_data[i])); } - auto vec_x_dims = phi::vectorize(x_dims); + auto vec_x_dims = common::vectorize(x_dims); if (repeat_times_data.size() < vec_x_dims.size()) { int diff = vec_x_dims.size() - repeat_times_data.size(); repeat_times_data.insert(repeat_times_data.begin(), diff, 1); @@ -63,19 +63,19 @@ void TileKernel(const Context& dev_ctx, vec_x_dims.size(), repeat_times_data.size())); - DDim new_x_dims = make_ddim(vec_x_dims); + DDim new_x_dims = common::make_ddim(vec_x_dims); DDim out_dims(new_x_dims); DenseTensor new_x = x; vec_x_dims.insert(vec_x_dims.begin(), 1, 1); for (size_t i = 0; i < repeat_times_data.size(); ++i) { out_dims[i] *= repeat_times_data[i]; - new_x.Resize(make_ddim(vec_x_dims)); + new_x.Resize(common::make_ddim(vec_x_dims)); std::vector ins = {&new_x}; vec_x_dims[i] *= repeat_times_data[i]; if (i != repeat_times_data.size() - 1) { if (repeat_times_data[i] != 1) { DenseTensor tmp_out; - tmp_out.Resize(make_ddim(vec_x_dims)); + tmp_out.Resize(common::make_ddim(vec_x_dims)); dev_ctx.template Alloc(&tmp_out); std::vector outs = {&tmp_out}; phi::funcs::BroadcastKernel( @@ -86,7 +86,7 @@ void TileKernel(const Context& dev_ctx, vec_x_dims[i] *= vec_x_dims[i + 1]; vec_x_dims[i + 1] = 1; } else { - out->Resize(make_ddim(vec_x_dims)); + out->Resize(common::make_ddim(vec_x_dims)); dev_ctx.template Alloc(out); std::vector outs = {out}; phi::funcs::BroadcastKernel( diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index c5ac9f244d9682..aa8eb2c4969deb 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -98,7 +98,7 @@ void TopkKernel(const Context& dev_ctx, if (axis == in_dims.size() - 1) { // if get the topK from the last axis const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t& input_width = in_dims[in_dims.size() - 1]; if (k > input_width) { @@ -264,8 +264,8 @@ void TopkKernel(const Context& dev_ctx, dev_ctx.template Alloc(&trans_ind); dev_ctx.template Alloc(&trans_out); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_height = common::product( + common::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const int64_t input_width = trans_dims[trans_dims.size() - 1]; if (k > input_width) k = input_width; diff --git a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu index 549ecca212c85b..a78040eb6a6697 100644 --- a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu +++ b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu @@ -587,20 +587,20 @@ void TopPSamplingKernel(const Context& dev_ctx, int64_t* ids_ptr = dev_ctx.template Alloc(ids); DenseTensor ps_now; - ps_now.Resize(phi::make_ddim({bs, 1})); + ps_now.Resize(common::make_ddim({bs, 1})); dev_ctx.template Alloc(&ps_now); phi::Copy(dev_ctx, ps, dev_ctx.GetPlace(), false, &ps_now); DenseTensor inds_input; - 
inds_input.Resize(phi::make_ddim({bs, vocab_size})); + inds_input.Resize(common::make_ddim({bs, vocab_size})); dev_ctx.template Alloc(&inds_input); DenseTensor sorted_out; - sorted_out.Resize(phi::make_ddim({bs, vocab_size})); + sorted_out.Resize(common::make_ddim({bs, vocab_size})); dev_ctx.template Alloc(&sorted_out); DenseTensor sorted_id; - sorted_id.Resize(phi::make_ddim({bs, vocab_size})); + sorted_id.Resize(common::make_ddim({bs, vocab_size})); dev_ctx.template Alloc(&sorted_id); int BlockSize = GetBlockSize(vocab_size); @@ -629,10 +629,10 @@ void TopPSamplingKernel(const Context& dev_ctx, setup_kernel<<<1, 256, 0, cu_stream>>>(dev_curand_states, seed, bs); DenseTensor count_iter; - count_iter.Resize(phi::make_ddim({bs + 1})); + count_iter.Resize(common::make_ddim({bs + 1})); dev_ctx.template Alloc(&count_iter); DenseTensor count_iter_begin; - count_iter_begin.Resize(phi::make_ddim({bs})); + count_iter_begin.Resize(common::make_ddim({bs})); dev_ctx.template Alloc(&count_iter_begin); SetCountIter<<<1, 256, 0, cu_stream>>>(count_iter.data(), bs + 1); @@ -684,7 +684,7 @@ void TopPSamplingKernel(const Context& dev_ctx, temp_storage_bytes = div_up(temp_storage_bytes, 256) * 256; int64_t temp_size = temp_storage_bytes; DenseTensor temp_storage; - temp_storage.Resize(phi::make_ddim({temp_size})); + temp_storage.Resize(common::make_ddim({temp_size})); dev_ctx.template Alloc(&temp_storage); cub::DeviceSegmentedRadixSort::SortPairsDescending( diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu index 889c421eb0bb96..2a943fd0ac6815 100644 --- a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu +++ b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/triangular_solve_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" @@ -47,7 +47,7 @@ void TriangularSolveKernel(const Context& dev_ctx, const T* x_bst_data = x_bst.data(); ExpandKernel(dev_ctx, x, x_bst_dims, &x_bst); - out->Resize(phi::make_ddim(y_bst_dims_vec)); + out->Resize(common::make_ddim(y_bst_dims_vec)); T* out_data = dev_ctx.template Alloc(out); IntArray y_bst_dims(y_bst_dims_vec); ExpandKernel(dev_ctx, y, y_bst_dims, out); diff --git a/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu b/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu index 3b7f8a931278e9..aece91fb3ea46f 100644 --- a/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu @@ -30,7 +30,7 @@ void UniformInplaceGradKernel(const Context& ctx, int diag_step, float diag_val, DenseTensor* x_grad) { - auto dims = vectorize(x_grad->dims()); + auto dims = common::vectorize(x_grad->dims()); float value = static_cast(0.0f); phi::FullKernel(ctx, dims, value, phi::DataType::UNDEFINED, x_grad); } diff --git a/paddle/phi/kernels/gpu/uniform_kernel.cu b/paddle/phi/kernels/gpu/uniform_kernel.cu index 2a514947bb7177..f148cef4b3d535 100644 --- a/paddle/phi/kernels/gpu/uniform_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_kernel.cu @@ -61,7 +61,7 @@ void UniformKernel(const Context& dev_ctx, const Scalar& max, int seed, DenseTensor* out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); dev_ctx.template Alloc(out); 
if (seed == 0) { // Use global Generator seed diff --git a/paddle/phi/kernels/gpu/unique_consecutive_functor.h b/paddle/phi/kernels/gpu/unique_consecutive_functor.h index d70813c84aaaee..1ec918a02c43f5 100644 --- a/paddle/phi/kernels/gpu/unique_consecutive_functor.h +++ b/paddle/phi/kernels/gpu/unique_consecutive_functor.h @@ -54,13 +54,13 @@ static void UniqueConsecutiveFlattenedCUDATensor(const Context& context, auto in_data_hat = context.template Alloc(&in_hat); DenseTensor sorted_indices; - sorted_indices.Resize(phi::make_ddim({num_input})); + sorted_indices.Resize(common::make_ddim({num_input})); auto sorted_indices_data = context.template Alloc(&sorted_indices); thrust::sequence( thrust::device, sorted_indices_data, sorted_indices_data + num_input); // 1. Calculate op result: 'out' DenseTensor range; - range.Resize(phi::make_ddim({num_input + 1})); + range.Resize(common::make_ddim({num_input + 1})); auto range_data_ptr = context.template Alloc(&range); thrust::sequence( thrust::device, range_data_ptr, range_data_ptr + num_input + 1); @@ -72,14 +72,14 @@ static void UniqueConsecutiveFlattenedCUDATensor(const Context& context, thrust::device, out_data, out_data + num_input, range_data_ptr, equal) .first - out_data; - out->Resize(phi::make_ddim({num_out})); + out->Resize(common::make_ddim({num_out})); // 2. Calculate inverse index: 'inverse' if (return_inverse) { - inverse->Resize(phi::make_ddim({num_input})); + inverse->Resize(common::make_ddim({num_input})); auto inverse_data = context.template Alloc(inverse); DenseTensor inv_loc; - inv_loc.Resize(phi::make_ddim({num_input})); + inv_loc.Resize(common::make_ddim({num_input})); auto inv_loc_data_ptr = context.template Alloc(&inv_loc); thrust::adjacent_difference(thrust::device, in_data_hat, @@ -100,7 +100,7 @@ static void UniqueConsecutiveFlattenedCUDATensor(const Context& context, } // 3. Calculate 'counts' if (return_counts) { - counts->Resize(phi::make_ddim({num_out})); + counts->Resize(common::make_ddim({num_out})); auto count_data = context.template Alloc(counts); // init 'count_data' as 0 thrust::fill(thrust::device, count_data, count_data + num_out, 0); @@ -174,10 +174,10 @@ static void ComputeUniqueConsecutiveDims(const Context& context, DenseTensor* inverse, DenseTensor* counts) { // 1. inverse indices: 'inverse' - inverse->Resize(phi::make_ddim({row})); + inverse->Resize(common::make_ddim({row})); auto inverse_data = context.template Alloc(inverse); DenseTensor inv_loc; - inv_loc.Resize(phi::make_ddim({row})); + inv_loc.Resize(common::make_ddim({row})); auto inv_loc_data_ptr = context.template Alloc(&inv_loc); thrust::adjacent_difference(thrust::device, sorted_indices_data, @@ -198,7 +198,7 @@ static void ComputeUniqueConsecutiveDims(const Context& context, // 2. sorted indices DenseTensor range; - range.Resize(phi::make_ddim({row + 1})); + range.Resize(common::make_ddim({row + 1})); auto range_data_ptr = context.template Alloc(&range); thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); int num_out; @@ -211,10 +211,10 @@ static void ComputeUniqueConsecutiveDims(const Context& context, sorted_indices_data; thrust::device_ptr range_data_ptr_dev(range_data_ptr); range_data_ptr_dev[num_out] = row; - sorted_indices->Resize(phi::make_ddim({num_out})); + sorted_indices->Resize(common::make_ddim({num_out})); // 3. 
counts: 'counts' - counts->Resize(phi::make_ddim({num_out})); + counts->Resize(common::make_ddim({num_out})); auto count_data = context.template Alloc(counts); thrust::fill(thrust::device, count_data, count_data + row, 0); thrust::adjacent_difference( @@ -349,11 +349,11 @@ static void UniqueConsecutiveDimsCUDATensor(const Context& context, std::iota(permute.begin(), permute.end(), 0); permute[axis] = 0; permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + std::vector in_trans_dims_vec(common::vectorize(in.dims())); in_trans_dims_vec[axis] = in.dims()[0]; in_trans_dims_vec[0] = in.dims()[axis]; DenseTensor in_trans; - DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + DDim in_trans_dims = common::make_ddim(in_trans_dims_vec); in_trans.Resize(in_trans_dims); context.template Alloc(&in_trans); phi::funcs::TransCompute(in.dims().size(), // num of dims @@ -363,7 +363,7 @@ static void UniqueConsecutiveDimsCUDATensor(const Context& context, permute); // index of axis // Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + DDim in_trans_flat_dims = common::flatten_to_2d(in_trans_dims, 1); in_trans.Resize(in_trans_flat_dims); // now 'in_trans' is 2D @@ -372,7 +372,7 @@ static void UniqueConsecutiveDimsCUDATensor(const Context& context, const InT* in_trans_data = in_trans.data(); DenseTensor sorted_indices; - sorted_indices.Resize(phi::make_ddim({row})); + sorted_indices.Resize(common::make_ddim({row})); auto sorted_indices_data = context.template Alloc(&sorted_indices); // 2. Calculate 'inverse', 'counts' @@ -396,14 +396,14 @@ static void UniqueConsecutiveDimsCUDATensor(const Context& context, DenseTensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = sorted_indices.numel(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + out_trans.Resize(common::make_ddim(out_trans_dims_vec)); context.template Alloc(&out_trans); IndexSelect( context, in_trans, sorted_indices, &out_trans, 0); std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); + out->Resize(common::make_ddim(out_trans_dims_vec)); context.template Alloc(out); std::vector out_trans_unbind = phi::funcs::Unbind(out_trans); phi::funcs::ConcatFunctor concat_functor; diff --git a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu index 9c32bff0ccb809..207593065b7a91 100644 --- a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu @@ -17,8 +17,8 @@ #include "paddle/phi/kernels/unique_consecutive_kernel.h" #include "paddle/phi/kernels/gpu/unique_consecutive_functor.h" +#include "paddle/common/errors.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index 5d4399e42e1abb..682528b1d80c64 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -121,7 +121,7 @@ UniqueFlattendCUDATensor(const Context& context, phi::Copy(context, in, context.GetPlace(), false, &in_hat); auto* in_data_hat = context.template Alloc(&in_hat); - indices->Resize(phi::make_ddim({num_input})); + indices->Resize(common::make_ddim({num_input})); auto* indices_data = context.template Alloc(indices); 
thrust::sequence(thrust::device, indices_data, indices_data + num_input); @@ -130,7 +130,7 @@ UniqueFlattendCUDATensor(const Context& context, // 1. Calculate op result: 'out' DenseTensor range; - range.Resize(phi::make_ddim({num_input + 1})); + range.Resize(common::make_ddim({num_input + 1})); auto* range_data_ptr = context.template Alloc(&range); thrust::sequence( thrust::device, range_data_ptr, range_data_ptr + num_input + 1); @@ -142,14 +142,14 @@ UniqueFlattendCUDATensor(const Context& context, thrust::device, out_data, out_data + num_input, range_data_ptr, equal) .first - out_data; - out->Resize(phi::make_ddim({num_out})); + out->Resize(common::make_ddim({num_out})); // 3. Calculate inverse index: 'inverse' if (return_inverse) { - index->Resize(phi::make_ddim({num_input})); + index->Resize(common::make_ddim({num_input})); auto* inverse_data = context.template Alloc(index); DenseTensor inv_loc; - inv_loc.Resize(phi::make_ddim({num_input})); + inv_loc.Resize(common::make_ddim({num_input})); auto inv_loc_data_ptr = context.template Alloc(&inv_loc); thrust::adjacent_difference(thrust::device, in_data_hat, @@ -172,7 +172,7 @@ UniqueFlattendCUDATensor(const Context& context, // 2. Calculate sorted index: 'indices' if (return_index) { DenseTensor tmp_indices; - tmp_indices.Resize(phi::make_ddim({num_input})); + tmp_indices.Resize(common::make_ddim({num_input})); auto* tmp_indices_data_ptr = context.template Alloc(&tmp_indices); thrust::copy(thrust::device, in_data_hat, @@ -183,12 +183,12 @@ UniqueFlattendCUDATensor(const Context& context, tmp_indices_data_ptr + num_input, indices_data, equal); - indices->Resize(phi::make_ddim({num_out})); + indices->Resize(common::make_ddim({num_out})); } // 4. Calculate 'counts' if (return_counts) { - counts->Resize(phi::make_ddim({num_out})); + counts->Resize(common::make_ddim({num_out})); auto count_data = context.template Alloc(counts); // init 'count_data' as 0 thrust::fill(thrust::device, count_data, count_data + num_out, 0); @@ -219,12 +219,12 @@ UniqueFlattendCUDATensor(const Context& context, // 1. Sort indices DenseTensor in_resize; in_resize.ShareDataWith(in); - in_resize.Resize(phi::make_ddim({num_input})); + in_resize.Resize(common::make_ddim({num_input})); const InT* in_data = in_resize.data(); auto equal = BinaryEqual(1, in_data); auto not_equal = BinaryNotEqual(1, in_data); - indices->Resize(phi::make_ddim({num_input})); + indices->Resize(common::make_ddim({num_input})); auto* indices_data = context.template Alloc(indices); thrust::sequence(thrust::device, indices_data, indices_data + num_input); @@ -235,10 +235,10 @@ UniqueFlattendCUDATensor(const Context& context, // 2. Calculate inverse indices: 'index' if (return_inverse) { - index->Resize(phi::make_ddim({num_input})); + index->Resize(common::make_ddim({num_input})); auto* inverse_data = context.template Alloc(index); DenseTensor inv_loc; - inv_loc.Resize(phi::make_ddim({num_input})); + inv_loc.Resize(common::make_ddim({num_input})); auto inv_loc_data_ptr = context.template Alloc(&inv_loc); thrust::adjacent_difference(thrust::device, indices_data, @@ -260,7 +260,7 @@ UniqueFlattendCUDATensor(const Context& context, // 3. 
Calculate op result and sorted index: 'out' & 'indices' DenseTensor range; - range.Resize(phi::make_ddim({num_input + 1})); + range.Resize(common::make_ddim({num_input + 1})); auto* range_data_ptr = context.template Alloc(&range); thrust::sequence( thrust::device, range_data_ptr, range_data_ptr + num_input + 1); @@ -272,14 +272,14 @@ UniqueFlattendCUDATensor(const Context& context, equal) .first - indices_data; - indices->Resize(phi::make_ddim({num_out})); - out->Resize(phi::make_ddim({num_out})); + indices->Resize(common::make_ddim({num_out})); + out->Resize(common::make_ddim({num_out})); context.template Alloc(out); phi::IndexSelectKernel(context, in_resize, *indices, 0, out); // 4. Calculate 'counts' if (return_counts) { - counts->Resize(phi::make_ddim({num_out})); + counts->Resize(common::make_ddim({num_out})); auto count_data = context.template Alloc(counts); // init 'count_data' as 0 thrust::fill(thrust::device, count_data, count_data + num_out, 0); @@ -312,10 +312,10 @@ static void ComputeUniqueDims(const Context& context, not_equal_T not_equal, int64_t row) { // 1. inverse indices: 'inverse' - inverse->Resize(phi::make_ddim({row})); + inverse->Resize(common::make_ddim({row})); auto* inverse_data = context.template Alloc(inverse); DenseTensor inv_loc; - inv_loc.Resize(phi::make_ddim({row})); + inv_loc.Resize(common::make_ddim({row})); auto inv_loc_data_ptr = context.template Alloc(&inv_loc); thrust::adjacent_difference(thrust::device, sorted_indices_data, @@ -336,7 +336,7 @@ static void ComputeUniqueDims(const Context& context, // 2. sorted indices DenseTensor range; - range.Resize(phi::make_ddim({row + 1})); + range.Resize(common::make_ddim({row + 1})); auto range_data_ptr = context.template Alloc(&range); thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); int num_out; @@ -349,10 +349,10 @@ static void ComputeUniqueDims(const Context& context, sorted_indices_data; thrust::device_ptr range_data_ptr_dev(range_data_ptr); range_data_ptr_dev[num_out] = row; - sorted_indices->Resize(phi::make_ddim({num_out})); + sorted_indices->Resize(common::make_ddim({num_out})); // 3. counts: 'counts' - counts->Resize(phi::make_ddim({num_out})); + counts->Resize(common::make_ddim({num_out})); auto* count_data = context.template Alloc(counts); thrust::fill(thrust::device, count_data, count_data + num_out, 0); thrust::adjacent_difference(thrust::device, @@ -376,8 +376,8 @@ static void UniqueDimsCUDATensor(const Context& context, // 1. Transpose & reshape // Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] DenseTensor in_trans; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); - auto in_trans_dims = phi::make_ddim(in_trans_dims_vec); + std::vector in_trans_dims_vec(common::vectorize(in.dims())); + auto in_trans_dims = common::make_ddim(in_trans_dims_vec); std::vector permute(in.dims().size()); bool is_transpose = axis != 0; if (is_transpose) { @@ -386,7 +386,7 @@ static void UniqueDimsCUDATensor(const Context& context, permute[0] = axis; in_trans_dims_vec[axis] = in.dims()[0]; in_trans_dims_vec[0] = in.dims()[axis]; - in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans_dims = common::make_ddim(in_trans_dims_vec); in_trans.Resize(in_trans_dims); context.template Alloc(&in_trans); phi::funcs::TransCompute( @@ -399,7 +399,7 @@ static void UniqueDimsCUDATensor(const Context& context, in_trans.ShareDataWith(in); } // Reshape tensor: eg. 
[dim1, dim0, dim2] -> [dim1, dim0*dim2]
-  auto in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1);
+  auto in_trans_flat_dims = common::flatten_to_2d(in_trans_dims, 1);
  in_trans.Resize(in_trans_flat_dims);
  // now 'in_trans' is 2D
@@ -407,7 +407,7 @@ static void UniqueDimsCUDATensor(const Context& context,
  int64_t row = in_trans.dims()[0];
  const InT* in_trans_data = in_trans.data();
-  indices->Resize(phi::make_ddim({row}));
+  indices->Resize(common::make_ddim({row}));
  auto* sorted_indices_data = context.template Alloc(indices);
  // 2. Calculate 'indices', 'inverse', 'counts'
@@ -437,19 +437,19 @@ static void UniqueDimsCUDATensor(const Context& context,
  out_trans_dims_vec[0] = indices->numel();
  if (is_transpose) {
    DenseTensor out_trans;
-    out_trans.Resize(phi::make_ddim(out_trans_dims_vec));
+    out_trans.Resize(common::make_ddim(out_trans_dims_vec));
    context.template Alloc(&out_trans);
    phi::IndexSelectKernel(
        context, in_trans, *indices, 0, &out_trans);
    std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]);
-    out->Resize(phi::make_ddim(out_trans_dims_vec));
+    out->Resize(common::make_ddim(out_trans_dims_vec));
    context.template Alloc(out);
    phi::funcs::TransCompute(
        out_trans.dims().size(), context, out_trans, out, permute);
  } else {
-    out->Resize(phi::make_ddim(out_trans_dims_vec));
+    out->Resize(common::make_ddim(out_trans_dims_vec));
    context.template Alloc(out);
    phi::IndexSelectKernel(context, in_trans, *indices, 0, out);
diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
index b69c4a691d0e33..81cc8ee78a947d 100644
--- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
+++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu
@@ -188,7 +188,7 @@ struct GetMaxValue {
  const DenseTensor& input,
  T* max_value) {
  DenseTensor out_data;
-  out_data.Resize(phi::make_ddim({1}));
+  out_data.Resize(common::make_ddim({1}));
  dev_ctx.template Alloc(&out_data);
  switch (ComputeBlockSize(input.numel())) {
    FIXED_BLOCK_DIM_CASE(
diff --git a/paddle/phi/kernels/gpu/yolo_box_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_kernel.cu
index e948667624d6ca..8616b8bb429556 100644
--- a/paddle/phi/kernels/gpu/yolo_box_kernel.cu
+++ b/paddle/phi/kernels/gpu/yolo_box_kernel.cu
@@ -129,7 +129,8 @@ void YoloBoxKernel(const Context& dev_ctx,
  int bytes = sizeof(int) * anchors.size();
  DenseTensor tmp_anchors;
-  tmp_anchors.Resize(phi::make_dim(anchors.size()));
+  using common::make_dim;
+  tmp_anchors.Resize(make_dim(anchors.size()));
  int* anchors_data = dev_ctx.template Alloc(&tmp_anchors);
  const auto gplace = dev_ctx.GetPlace();
  const auto cplace = phi::CPUPlace();
diff --git a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu
index 060f8c86710b58..bde4faefc5de3f 100644
--- a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu
@@ -49,7 +49,7 @@ void AffineGridCudnnKernel(const Context& dev_ctx,
  h_size_data[1] = size_attr[1];
  h_size_data[2] = size_attr[2];
  h_size_data[3] = size_attr[3];
-  output->Resize(phi::make_ddim({n, h_size_data[2], h_size_data[3], 2}));
+  output->Resize(common::make_ddim({n, h_size_data[2], h_size_data[3], 2}));
  T* output_data = dev_ctx.template Alloc(output);
  ScopedSpatialTransformerDescriptor st_desc;
  cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
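Note: every hunk in this sweep makes the same substitution — shape helpers such as make_ddim, make_dim, vectorize, slice_ddim and flatten_to_2d now come from paddle/common/ddim.h under the common namespace instead of phi. A minimal standalone sketch of the vectorize/make_ddim round-trip these kernels lean on; Dims and the three functions are toy stand-ins, not the real API (the real DDim is a fixed-capacity dimension type, not a std::vector):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Toy stand-ins for the common:: shape helpers.
    using Dims = std::vector<int64_t>;
    Dims make_ddim(Dims v) { return v; }
    Dims vectorize(const Dims& d) { return d; }
    int64_t product(const Dims& d) {
      int64_t p = 1;
      for (int64_t x : d) p *= x;
      return p;
    }

    int main() {
      Dims dims = make_ddim({8, 3, 32, 32});
      assert(product(dims) == 8 * 3 * 32 * 32);
      Dims v = vectorize(dims);  // pull the shape apart as a plain vector,
      v[1] = 16;                 // edit one axis,
      dims = make_ddim(v);       // and rebuild the DDim, as the hunks here do
      assert(product(dims) == 8 * 16 * 32 * 32);
      return 0;
    }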
diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h
index d0bdcc10beaa83..1b8fe788c30c21 100644
--- a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h
+++ b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h
@@ -87,7 +87,7 @@ class CudnnFrontendConvHelper {
  const phi::DenseTensor* tensor,
  int64_t id,
  cudnnTensorFormat_t layout_format) {
-  auto transformed_dims = phi::vectorize(tensor->dims());
+  auto transformed_dims = common::vectorize(tensor->dims());
  if (layout_format == CUDNN_TENSOR_NHWC) {
    transformed_dims = phi::backends::gpu::TransformDimOrder(transformed_dims);
diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
index 186bbd75fae62c..1b6ad4fdaa93c3 100644
--- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
+++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
@@ -146,8 +146,8 @@ struct ConvArgsBase {
  template
  phi::autotune::ConvCacheKey ConvertToConvCacheKey() const {
-    auto x_shape = phi::vectorize(x->dims());
-    auto w_shape = phi::vectorize(w->dims());
+    auto x_shape = common::vectorize(x->dims());
+    auto w_shape = common::vectorize(w->dims());
    VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
             << ", strides=" << s << ", paddings=" << p << ", dilations=" << d
             << ", data=" << phi::CppTypeToDataType::Type()
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index 2c6e898fa25c85..77b636bbb4ba1c 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -536,7 +536,7 @@ void ConvCudnnGradKernel(const Context& ctx,
    in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
    filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1);
  }
-  std::vector ksize = vectorize(filter_data_dims);
+  std::vector ksize = common::vectorize(filter_data_dims);
  UpdatePaddingAndDilation(
      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
@@ -579,7 +579,7 @@ void ConvCudnnGradKernel(const Context& ctx,
      input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
    }
  }
-  DDim new_input_shape(make_ddim(new_input_shape_vec));
+  DDim new_input_shape(common::make_ddim(new_input_shape_vec));
  transformed_input.Resize(new_input_shape);
  ctx.template Alloc(&transformed_input);
@@ -906,7 +906,7 @@ void ConvCudnnGradGradKernel(
  auto filter_dims = W->dims();
  DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
-  std::vector ksize = vectorize(filter_data_dims);
+  std::vector ksize = common::vectorize(filter_data_dims);
  UpdatePaddingAndDilation(
      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
@@ -935,7 +935,7 @@ void ConvCudnnGradGradKernel(
    input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
    input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
  }
-  DDim new_input_shape(make_ddim(new_input_shape_vec));
+  DDim new_input_shape(common::make_ddim(new_input_shape_vec));
  transformed_X.Resize(new_input_shape);
  transformed_ddX.Resize(new_input_shape);
  transformed_dX.Resize(new_input_shape);
diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu
index 65418673827cd5..36d0bad6b103f7 100644
--- a/paddle/phi/kernels/gpudnn/conv_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu
@@ -395,7 +395,7 @@ void ConvCudnnKernel(const Context& ctx,
    filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1);
  }
-  std::vector ksize = vectorize(filter_data_dims);
+  std::vector ksize = common::vectorize(filter_data_dims);
  UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); @@ -435,7 +435,7 @@ void ConvCudnnKernel(const Context& ctx, input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; } } - DDim new_input_shape(make_ddim(new_input_shape_vec)); + DDim new_input_shape(common::make_ddim(new_input_shape_vec)); transformed_input.Resize(new_input_shape); ctx.template Alloc(&transformed_input); diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index 50bae0a8bca3e2..07ab10e8f5a542 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -16,11 +16,11 @@ limitations under the License. */ #include +#include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" @@ -65,8 +65,8 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, // if channel_last, transpose to channel_first DenseTensor x_transpose; DenseTensor dout_transpose; - std::vector x_vec = vectorize(x.dims()); - std::vector out_vec = vectorize(dout.dims()); + std::vector x_vec = common::vectorize(x.dims()); + std::vector out_vec = common::vectorize(dout.dims()); if (data_layout == GPUDNNDataLayout::kNHWC) { if (strides.size() == 2U) { std::vector axis = {0, 3, 1, 2}; @@ -96,7 +96,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, DDim x_data_dims; x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); @@ -121,7 +121,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; } - transformed_dout.Resize(make_ddim(new_dout_shape_vec)); + transformed_dout.Resize(common::make_ddim(new_dout_shape_vec)); ctx.template Alloc(&transformed_dout); const int rank = x_transpose.dims().size(); @@ -154,7 +154,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, const T* x_data = x_transpose.data(); const T* dout_data = transformed_dout.data(); - out_vec = vectorize(transformed_dout.dims()); + out_vec = common::vectorize(transformed_dout.dims()); // ------------------- cudnn descriptors --------------------- GPUDNNDataLayout layout; @@ -312,7 +312,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, DenseTensor dx_transpose; DenseTensor dx_nchw; dx_nchw.ShareDataWith(*dx); - dx_nchw.Resize(make_ddim(x_vec)); + dx_nchw.Resize(common::make_ddim(x_vec)); if (strides.size() == 2U) { std::vector axis = {0, 2, 3, 1}; dx_transpose = Transpose(ctx, dx_nchw, axis); @@ -483,13 +483,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( transformed_dx_channel = *dx; } } - std::vector out_vec = vectorize(transformed_dout_channel.dims()); + std::vector out_vec = + common::vectorize(transformed_dout_channel.dims()); auto x_dims = transformed_x_channel.dims(); auto filter_dims = filter.dims(); DDim x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); DDim filter_data_dims = slice_ddim(filter_dims, 2, 
filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); @@ -527,10 +528,10 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( input_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; input_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; } - DDim new_input_shape(make_ddim(new_input_shape_vec)); + DDim new_input_shape(common::make_ddim(new_input_shape_vec)); transformed_x.Resize(new_input_shape); transformed_ddx.Resize(new_input_shape); - transformed_dout.Resize(make_ddim(new_output_grad_shape_vec)); + transformed_dout.Resize(common::make_ddim(new_output_grad_shape_vec)); ctx.template Alloc(&transformed_x); ctx.template Alloc(&transformed_ddx); @@ -601,12 +602,12 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( } if (!is_sys_pad) { - transformed_ddout_channel.Resize(make_ddim(transformed_out_vec)); + transformed_ddout_channel.Resize(common::make_ddim(transformed_out_vec)); ctx.template Alloc(&transformed_ddout_channel); } else { ctx.template Alloc(ddout); transformed_ddout_channel = *ddout; - transformed_ddout_channel.Resize(make_ddim(transformed_out_vec)); + transformed_ddout_channel.Resize(common::make_ddim(transformed_out_vec)); } const T* x_ = transformed_x.data(); diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index df360ab388a6d7..fe46ea978f14b3 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -16,11 +16,11 @@ limitations under the License. */ #include +#include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/padding.h" @@ -57,8 +57,8 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, const GPUDNNDataLayout data_layout = (data_format != "NHWC" ? 
GPUDNNDataLayout::kNCHW : GPUDNNDataLayout::kNHWC); - std::vector x_vec = vectorize(x.dims()); - std::vector out_vec = vectorize(out->dims()); + std::vector x_vec = common::vectorize(x.dims()); + std::vector out_vec = common::vectorize(out->dims()); // if channel_last, transpose to channel_first DenseTensor x_transpose; if (data_layout == GPUDNNDataLayout::kNHWC) { @@ -87,7 +87,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, DDim x_data_dims; x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); @@ -110,7 +110,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; } - DDim new_x_shape(make_ddim(new_x_shape_vec)); + DDim new_x_shape(common::make_ddim(new_x_shape_vec)); transformed_x.Resize(new_x_shape); ctx.template Alloc(&transformed_x); @@ -152,7 +152,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, } const T* x_data = transformed_x.data(); - x_vec = vectorize(transformed_x.dims()); + x_vec = common::vectorize(transformed_x.dims()); std::vector transformed_out_vec = out_vec; for (size_t i = 0; i < data_dim; ++i) { @@ -163,12 +163,12 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, DenseTensor transformed_out; if (!is_sys_pad) { - transformed_out.Resize(make_ddim(transformed_out_vec)); + transformed_out.Resize(common::make_ddim(transformed_out_vec)); ctx.template Alloc(&transformed_out); } else { ctx.template Alloc(out); transformed_out.ShareDataWith(*out); - transformed_out.Resize(make_ddim(transformed_out_vec)); + transformed_out.Resize(common::make_ddim(transformed_out_vec)); } T* transformed_out_data = transformed_out.data(); @@ -288,7 +288,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, DenseTensor out_transpose; DenseTensor out_nchw; out_nchw.ShareDataWith(*out); - out_nchw.Resize(make_ddim(out_vec)); + out_nchw.Resize(common::make_ddim(out_vec)); if (strides.size() == 2U) { out_transpose = Transpose(ctx, out_nchw, {0, 2, 3, 1}); diff --git a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu index 1161040f2163f1..24e79c77a50e1b 100644 --- a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu @@ -119,12 +119,12 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, // input transformed_input.Resize(input->dims()); - auto in_dims_vec = vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[4]; in_dims_vec[2] = input->dims()[1]; in_dims_vec[3] = input->dims()[2]; in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(make_ddim(in_dims_vec)); + transformed_input.Resize(common::make_ddim(in_dims_vec)); ctx.Alloc(&transformed_input, input->type()); funcs::Transpose trans5; @@ -132,12 +132,12 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, // output transformed_output.Resize(output->dims()); - auto out_dims_vec = vectorize(output->dims()); + auto out_dims_vec = common::vectorize(output->dims()); out_dims_vec[1] = output->dims()[4]; out_dims_vec[2] = output->dims()[1]; out_dims_vec[3] = output->dims()[2]; out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(make_ddim(out_dims_vec)); + 
transformed_output.Resize(common::make_ddim(out_dims_vec)); ctx.Alloc(&transformed_output, output->type()); @@ -145,14 +145,14 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, trans5_v2(ctx, *output, &transformed_output, axis); // output grad - transformed_output_grad.Resize(make_ddim(out_dims_vec)); + transformed_output_grad.Resize(common::make_ddim(out_dims_vec)); ctx.Alloc(&transformed_output_grad, output_grad->type()); funcs::Transpose trans5_v3; trans5_v3(ctx, *output_grad, &transformed_output_grad, axis); // input grad - transformed_input_grad.Resize(make_ddim(in_dims_vec)); + transformed_input_grad.Resize(common::make_ddim(in_dims_vec)); #ifdef PADDLE_WITH_HIP // MIOPEN not support NHWC data layout @@ -163,11 +163,11 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, // input transformed_input.Resize(input->dims()); - auto in_dims_vec = vectorize(input->dims()); + auto in_dims_vec = common::vectorize(input->dims()); in_dims_vec[1] = input->dims()[3]; in_dims_vec[2] = input->dims()[1]; in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(make_ddim(in_dims_vec)); + transformed_input.Resize(common::make_ddim(in_dims_vec)); ctx.Alloc(&transformed_input, input->type()); funcs::Transpose trans4; @@ -175,25 +175,25 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, // output transformed_output.Resize(output->dims()); - auto out_dims_vec = vectorize(output->dims()); + auto out_dims_vec = common::vectorize(output->dims()); out_dims_vec[1] = output->dims()[3]; out_dims_vec[2] = output->dims()[1]; out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(make_ddim(out_dims_vec)); + transformed_output.Resize(common::make_ddim(out_dims_vec)); ctx.Alloc(&transformed_output, output->type()); funcs::Transpose trans4_v2; trans4_v2(ctx, *output, &transformed_output, axis); // output grad - transformed_output_grad.Resize(make_ddim(out_dims_vec)); + transformed_output_grad.Resize(common::make_ddim(out_dims_vec)); ctx.Alloc(&transformed_output_grad, output_grad->type()); funcs::Transpose trans4_v3; trans4_v3(ctx, *output_grad, &transformed_output_grad, axis); // input grad - transformed_input_grad.Resize(make_ddim(in_dims_vec)); + transformed_input_grad.Resize(common::make_ddim(in_dims_vec)); #endif } else { layout = GetLayoutFromStr(data_format); @@ -214,14 +214,14 @@ void PoolGradRawGPUDNNKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, vectorize(transformed_input.dims())); + layout, common::vectorize(transformed_input.dims())); miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, vectorize(transformed_output.dims())); + layout, common::vectorize(transformed_output.dims())); #else cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, vectorize(transformed_input.dims())); + layout, common::vectorize(transformed_input.dims())); cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, vectorize(transformed_output.dims())); + layout, common::vectorize(transformed_output.dims())); #endif PoolingMode pooling_mode; if (pooling_type == "max") { diff --git a/paddle/phi/kernels/gpudnn/pool_kernel.cu b/paddle/phi/kernels/gpudnn/pool_kernel.cu index b1a79dd8740680..5bd1e2d6a12c1c 100644 --- a/paddle/phi/kernels/gpudnn/pool_kernel.cu +++ b/paddle/phi/kernels/gpudnn/pool_kernel.cu @@ -91,12 +91,12 @@ void PoolRawGPUDNNKernel(const Context& ctx, // input transformed_input.Resize(input->dims()); - auto in_dims_vec = vectorize(input->dims()); + auto 
in_dims_vec = common::vectorize(input->dims());
  in_dims_vec[1] = input->dims()[4];
  in_dims_vec[2] = input->dims()[1];
  in_dims_vec[3] = input->dims()[2];
  in_dims_vec[4] = input->dims()[3];
-  transformed_input.Resize(make_ddim(in_dims_vec));
+  transformed_input.Resize(common::make_ddim(in_dims_vec));
  ctx.Alloc(&transformed_input, input->type());
  funcs::Transpose trans5;
@@ -105,12 +105,12 @@ void PoolRawGPUDNNKernel(const Context& ctx,
  // output
  transformed_output.Resize(output->dims());
-  auto out_dims_vec = vectorize(output->dims());
+  auto out_dims_vec = common::vectorize(output->dims());
  out_dims_vec[1] = output->dims()[4];
  out_dims_vec[2] = output->dims()[1];
  out_dims_vec[3] = output->dims()[2];
  out_dims_vec[4] = output->dims()[3];
-  transformed_output.Resize(make_ddim(out_dims_vec));
+  transformed_output.Resize(common::make_ddim(out_dims_vec));
#ifdef PADDLE_WITH_HIP
  // MIOPEN not support NHWC data layout
  } else if (data_format == str_NHWC) {
@@ -119,22 +119,22 @@ void PoolRawGPUDNNKernel(const Context& ctx,
  std::vector axis{0, 3, 1, 2};
  transformed_input.Resize(input->dims());
-  auto in_dims_vec = vectorize(input->dims());
+  auto in_dims_vec = common::vectorize(input->dims());
  in_dims_vec[1] = input->dims()[3];
  in_dims_vec[2] = input->dims()[1];
  in_dims_vec[3] = input->dims()[2];
-  transformed_input.Resize(make_ddim(in_dims_vec));
+  transformed_input.Resize(common::make_ddim(in_dims_vec));
  ctx.Alloc(&transformed_input, input->type());
  funcs::Transpose trans;
  trans(ctx, *input, &transformed_input, axis);
  transformed_output.Resize(output->dims());
-  auto out_dims_vec = vectorize(output->dims());
+  auto out_dims_vec = common::vectorize(output->dims());
  out_dims_vec[1] = output->dims()[3];
  out_dims_vec[2] = output->dims()[1];
  out_dims_vec[3] = output->dims()[2];
-  transformed_output.Resize(make_ddim(out_dims_vec));
+  transformed_output.Resize(common::make_ddim(out_dims_vec));
#endif
  } else {
    layout = GetLayoutFromStr(data_format);
@@ -152,14 +152,14 @@ void PoolRawGPUDNNKernel(const Context& ctx,
#ifdef PADDLE_WITH_HIP
  miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor(
-      layout, vectorize(transformed_input.dims()));
+      layout, common::vectorize(transformed_input.dims()));
  miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor(
-      layout, vectorize(transformed_output.dims()));
+      layout, common::vectorize(transformed_output.dims()));
#else
  cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor(
-      layout, vectorize(transformed_input.dims()));
+      layout, common::vectorize(transformed_input.dims()));
  cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor(
-      layout, vectorize(transformed_output.dims()));
+      layout, common::vectorize(transformed_output.dims()));
#endif
  PoolingMode pooling_mode;
  if (pooling_type == "max") {
diff --git a/paddle/phi/kernels/impl/amp_kernel_impl.h b/paddle/phi/kernels/impl/amp_kernel_impl.h
index 6757e1e6895751..ec857f3f640d56 100644
--- a/paddle/phi/kernels/impl/amp_kernel_impl.h
+++ b/paddle/phi/kernels/impl/amp_kernel_impl.h
@@ -128,7 +128,8 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
  if (is_found_inf_on_cpu) {
    if (*found_inf_data) {
      for (auto* out : outs) {
-        Full(dev_ctx, vectorize(out->dims()), static_cast(0), out);
+        Full(
+            dev_ctx, common::vectorize(out->dims()), static_cast(0), out);
      }
    }
  } else {
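The amp hunk above resets every output tensor to zeros when an inf/nan is detected during loss scaling. A self-contained sketch of that reset under toy assumptions — Full here is a stand-in for phi's fill-with-constant kernel, and found_inf mirrors *found_inf_data:

    #include <cstdint>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Toy stand-in: allocate numel(shape) elements, all set to `value`.
    std::vector<float> Full(const std::vector<int64_t>& shape, float value) {
      const int64_t numel = std::accumulate(
          shape.begin(), shape.end(), int64_t{1}, std::multiplies<int64_t>());
      return std::vector<float>(static_cast<size_t>(numel), value);
    }

    int main() {
      const bool found_inf = true;  // mirrors *found_inf_data
      const std::vector<int64_t> out_dims = {2, 3};  // as if common::vectorize(out->dims())
      if (found_inf) {
        std::vector<float> zeros = Full(out_dims, 0.0f);  // 6 zeros, one per element
        (void)zeros;
      }
      return 0;
    }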
diff --git a/paddle/phi/kernels/impl/bilinear_grad_kernel_impl.h b/paddle/phi/kernels/impl/bilinear_grad_kernel_impl.h
index dac527e24425d6..9fefa1704b3e6c 100644
--- a/paddle/phi/kernels/impl/bilinear_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/bilinear_grad_kernel_impl.h
@@ -42,13 +42,13 @@ void BilinearGradKernel(const Context& ctx,
  auto& place = *ctx.eigen_device();
  // Create the intermediate variable to calculate the Output(Y@Grad).
  DenseTensor x_scale;
-  x_scale.Resize(make_ddim({batch_size, x_dim}));
+  x_scale.Resize(common::make_ddim({batch_size, x_dim}));
  ctx.template Alloc(&x_scale);
  auto x_scale_mat = EigenMatrix::From(x_scale);
  // Create the intermediate variable to calculate the Output(X@Grad).
  DenseTensor y_scale;
-  y_scale.Resize(make_ddim({batch_size, y_dim}));
+  y_scale.Resize(common::make_ddim({batch_size, y_dim}));
  ctx.template Alloc(&y_scale);
  auto y_scale_mat = EigenMatrix::From(y_scale);
@@ -78,7 +78,7 @@ void BilinearGradKernel(const Context& ctx,
  for (int i = 0; i < out_dim; ++i) {
    DenseTensor weight_i =
-        weight.Slice(i, i + 1).Resize(make_ddim({x_dim, y_dim}));
+        weight.Slice(i, i + 1).Resize(common::make_ddim({x_dim, y_dim}));
    auto output_vec = dout_mat.chip(i, 1);
    if (dx) {
@@ -116,8 +116,8 @@ void BilinearGradKernel(const Context& ctx,
      dy->data());
    }
    if (dweight) {
-      DenseTensor dweight_i =
-          dweight->Slice(i, i + 1).Resize(make_ddim({x_dim, y_dim}));
+      DenseTensor dweight_i = dweight->Slice(i, i + 1).Resize(
+          common::make_ddim({x_dim, y_dim}));
      blas.GEMM(CblasTrans,
                CblasNoTrans,
                x_dim,
diff --git a/paddle/phi/kernels/impl/bilinear_kernel_impl.h b/paddle/phi/kernels/impl/bilinear_kernel_impl.h
index 12ad7eda263d0c..2b00cb69d728cc 100644
--- a/paddle/phi/kernels/impl/bilinear_kernel_impl.h
+++ b/paddle/phi/kernels/impl/bilinear_kernel_impl.h
@@ -44,14 +44,14 @@ void BilinearKernel(const Context& ctx,
  // Input(X) multiplied by Input(Weight_i), the formula is:
  // left_mul = X Weight_i.
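For reference while reading the two bilinear files: the op computes out(b, i) = x_b^T W_i y_b, and the {batch_size, y_dim} and {x_dim, y_dim} shapes built with common::make_ddim are the intermediates of that product. A naive loop sketch with made-up sizes (the real kernels use BLAS GEMM and Eigen chips instead):

    #include <cstdint>
    #include <vector>

    int main() {
      const int64_t batch_size = 2, x_dim = 3, y_dim = 4;
      std::vector<float> X(batch_size * x_dim, 1.0f);
      std::vector<float> Wi(x_dim * y_dim, 0.5f);  // slice i of the weight
      std::vector<float> Y(batch_size * y_dim, 2.0f);
      std::vector<float> left_mul(batch_size * y_dim, 0.0f);  // X * W_i
      std::vector<float> out_col(batch_size, 0.0f);           // column i of out
      for (int64_t b = 0; b < batch_size; ++b) {
        for (int64_t j = 0; j < y_dim; ++j) {
          for (int64_t k = 0; k < x_dim; ++k) {
            left_mul[b * y_dim + j] += X[b * x_dim + k] * Wi[k * y_dim + j];
          }
          out_col[b] += left_mul[b * y_dim + j] * Y[b * y_dim + j];
        }
      }
      // Each left_mul entry is 3 * 1 * 0.5 = 1.5; out_col[b] = 4 * 1.5 * 2 = 12.
      return 0;
    }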
DenseTensor left_mul;
-  left_mul.Resize(phi::make_ddim({batch_size, y_dim}));
+  left_mul.Resize(common::make_ddim({batch_size, y_dim}));
  ctx.template Alloc(&left_mul);
  auto left_mul_mat = EigenMatrix::From(left_mul);
  for (int i = 0; i < out_dim; ++i) {
    auto output_col_vec = output_mat.chip(i, 1);
    DenseTensor weight_mat =
-        weight.Slice(i, i + 1).Resize(phi::make_ddim({x_dim, y_dim}));
+        weight.Slice(i, i + 1).Resize(common::make_ddim({x_dim, y_dim}));
    phi::funcs::GetBlas(ctx).GEMM(CblasNoTrans,
                                  CblasNoTrans,
                                  batch_size,
diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
index c61b10d5a21995..144c8fe44dd260 100644
--- a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
+++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
@@ -60,7 +60,7 @@ void ApplyBroadcast(const Context& ctx,
      new_input_dims_vec[out_axis] = input_dims[in_axis];
    }
  }
-  auto new_input_dims = phi::make_ddim(new_input_dims_vec);
+  auto new_input_dims = common::make_ddim(new_input_dims_vec);
  // Initialize input X with new_input_dims_vec, so it's rank-aligned with the
  // output
diff --git a/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h
index 562ff25317ec9b..40a12c471b94a7 100644
--- a/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h
+++ b/paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h
@@ -75,7 +75,7 @@ void CholeskySolveKernel(const Context& dev_ctx,
  int x_bst_ndim = x_bst_dims_vec.size();
  int M = static_cast(x_bst_dims_vec[x_bst_ndim - 2]);
  int N = static_cast(x_bst_dims_vec[x_bst_ndim - 1]);
-  int batchsize = product(phi::slice_ddim(x_bst.dims(), 0, x_bst_ndim - 2));
+  int batchsize = product(common::slice_ddim(x_bst.dims(), 0, x_bst_ndim - 2));
  DenseTensor info = phi::Empty(dev_ctx, IntArray({batchsize}));
  int* info_data = info.data();
@@ -94,7 +94,7 @@ void CholeskySolveKernel(const Context& dev_ctx,
  // calculate out's conjugate for complex
  result = phi::TransposeLast2Dim(dev_ctx, result);
-  out->Resize(phi::make_ddim(x_bst_dims_vec));
+  out->Resize(common::make_ddim(x_bst_dims_vec));
  ConjKernel(dev_ctx, result, out);
}
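The cholesky_solve hunk above counts batches with product(slice_ddim(dims, 0, ndim - 2)), i.e. it multiplies every leading dimension while leaving the trailing M x N matrix out. A standalone sketch of that idiom with vector-based stand-ins for the common:: helpers:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    using Shape = std::vector<int64_t>;
    // Stand-ins for common::slice_ddim and common::product.
    Shape SliceDDim(const Shape& d, int begin, int end) {
      return Shape(d.begin() + begin, d.begin() + end);
    }
    int64_t Product(const Shape& d) {
      int64_t p = 1;
      for (int64_t x : d) p *= x;
      return p;
    }

    int main() {
      const Shape x_bst_dims = {5, 7, 4, 3};  // two batch dims, then M = 4, N = 3
      const int ndim = static_cast<int>(x_bst_dims.size());
      // Multiply only the batch dims: 5 * 7 solves of a 4 x 3 system.
      assert(Product(SliceDDim(x_bst_dims, 0, ndim - 2)) == 5 * 7);
      return 0;
    }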
diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
index ec75952aaae8e2..3baf3fd84b0c49 100644
--- a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h
@@ -68,17 +68,17 @@ void ConvGradKernel(const Context& dev_ctx,
  auto filter_dims = filter.dims();
  DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
-  std::vector ksize = vectorize(filter_data_dims);
+  std::vector ksize = common::vectorize(filter_data_dims);
  UpdatePaddingAndDilation(
      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
  const int batch_size = static_cast(transformed_input.dims()[0]);
  // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-  std::vector filter_shape_vec(vectorize(filter.dims()));
+  std::vector filter_shape_vec(common::vectorize(filter.dims()));
  // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
  std::vector output_shape_vec(
-      vectorize(transformed_output_grad.dims()));
+      common::vectorize(transformed_output_grad.dims()));
  // use col_shape in the im2col calculation
  // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
@@ -90,7 +90,7 @@ void ConvGradKernel(const Context& dev_ctx,
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
-  DDim col_shape(make_ddim(col_shape_vec));
+  DDim col_shape(common::make_ddim(col_shape_vec));
  // use col_matrix_shape in the gemm calculation
  // size: (i_c/g * k_h * k_w, o_h * o_w)
@@ -310,13 +310,14 @@ void ConvGradGradKernel(const Context& dev_ctx,
  DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
-  std::vector ksize = vectorize(filter_data_dims);
+  std::vector ksize = common::vectorize(filter_data_dims);
  UpdatePaddingAndDilation(
      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
  const int batch_size = static_cast(transformed_X.dims()[0]);
-  std::vector filter_shape_vec(vectorize(W.dims()));
-  std::vector output_shape_vec(vectorize(transformed_dY.dims()));
+  std::vector filter_shape_vec(common::vectorize(W.dims()));
+  std::vector output_shape_vec(
+      common::vectorize(transformed_dY.dims()));
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector col_shape_vec(1 + 2 * data_dim);
@@ -326,7 +327,7 @@ void ConvGradGradKernel(const Context& dev_ctx,
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
  }
-  DDim col_shape(make_ddim(col_shape_vec));
+  DDim col_shape(common::make_ddim(col_shape_vec));
  // col_matrix_shape [in_channel/group * kh * kw, oh * ow]
  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
  // input_shape [Cin, H, W]
diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h
index 06ba3104a81124..e40ba59a2d3a11 100644
--- a/paddle/phi/kernels/impl/conv_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_kernel_impl.h
@@ -66,7 +66,7 @@ void ConvKernelImpl(const Context& dev_ctx,
  DDim in_data_dims = slice_ddim(trans_in_dims, 2, trans_in_dims.size());
  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
-  std::vector ksize = vectorize(filter_data_dims);
+  std::vector ksize = common::vectorize(filter_data_dims);
  UpdatePaddingAndDilation(
      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
@@ -74,11 +74,12 @@ void ConvKernelImpl(const Context& dev_ctx,
  // filter_shape_vec:
  // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-  std::vector filter_shape_vec(vectorize(filter.dims()));
+  std::vector filter_shape_vec(common::vectorize(filter.dims()));
  // output_shape_vec:
  // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
-  std::vector output_shape_vec(vectorize(transformed_output.dims()));
+  std::vector output_shape_vec(
+      common::vectorize(transformed_output.dims()));
  // use col_shape in the im2col calculation
  // col_shape_vec:
@@ -93,7 +94,7 @@ void ConvKernelImpl(const Context& dev_ctx,
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
-  DDim col_shape(make_ddim(col_shape_vec));
+  DDim col_shape(common::make_ddim(col_shape_vec));
  // use col_matrix_shape in the gemm calculation
  // size:
diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h
index 2d92f8156b607d..c4e58838c7e574 100644
--- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h
@@ -14,8 +14,8 @@
#pragma once
-#include "paddle/phi/common/layout.h"
-#include "paddle/phi/core/ddim.h"
+#include "paddle/common/ddim.h"
+#include "paddle/common/layout.h"
#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
#include
"paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -39,7 +39,7 @@ void ConvTransposeGradRawKernel(const Context& ctx, const std::string& data_format, DenseTensor* dx, DenseTensor* dfilter) { - const DataLayout data_layout = phi::StringToDataLayout(data_format); + const DataLayout data_layout = common::StringToDataLayout(data_format); // For filter, we do not use const pointer because we will do reshape, // but we should avoid modifying its value. DenseTensor filter_ = filter; @@ -63,15 +63,15 @@ void ConvTransposeGradRawKernel(const Context& ctx, in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); } DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); // x_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first // x_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last - std::vector x_shape_vec = vectorize(x.dims()); + std::vector x_shape_vec = common::vectorize(x.dims()); // filter_shape_vec: {i_c, o_c, k_h, k_w} or {i_c, o_c, k_d, k_h, k_w} - std::vector filter_shape_vec = vectorize(filter_.dims()); + std::vector filter_shape_vec = common::vectorize(filter_.dims()); // use col_shape in the im2col and col2im (or vol2col and col2vol) // calculation @@ -91,7 +91,7 @@ void ConvTransposeGradRawKernel(const Context& ctx, col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 1]; } } - DDim col_shape(make_ddim(col_shape_vec)); + DDim col_shape(common::make_ddim(col_shape_vec)); // use col_matrix_shape in the gemm calculation // size: (o_c * k_h * k_w, h * w) or (o_c * k_d * k_h * k_w, d * h * w) diff --git a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h index 9fab3e6735b40d..ac6ce032a9b254 100644 --- a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" #include "paddle/phi/kernels/conv_transpose_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -37,7 +37,7 @@ void ConvTransposeRawKernel(const Context& ctx, const std::vector& dilations, const std::string& data_format, DenseTensor* out) { - const DataLayout data_layout = phi::StringToDataLayout(data_format); + const DataLayout data_layout = common::StringToDataLayout(data_format); // The filter will be reshaped, so it should not be constant DenseTensor filter_ = filter; std::vector paddings_ = paddings; @@ -55,15 +55,15 @@ void ConvTransposeRawKernel(const Context& ctx, in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); } DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); // x_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first // x_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last - std::vector x_shape_vec = vectorize(x.dims()); + std::vector x_shape_vec = common::vectorize(x.dims()); // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector 
filter_shape_vec = vectorize(filter_.dims()); + std::vector filter_shape_vec = common::vectorize(filter_.dims()); // use col_shape in the im2col and col2im (or vol2col and col2vol) // calculation @@ -83,7 +83,7 @@ void ConvTransposeRawKernel(const Context& ctx, col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 1]; } } - DDim col_shape(make_ddim(col_shape_vec)); + DDim col_shape(common::make_ddim(col_shape_vec)); // use col_matrix_shape in the gemm calculation // size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w) diff --git a/paddle/phi/kernels/impl/crop_kernel_impl.h b/paddle/phi/kernels/impl/crop_kernel_impl.h index 5aa951d4da09d6..3ad039b05b8465 100644 --- a/paddle/phi/kernels/impl/crop_kernel_impl.h +++ b/paddle/phi/kernels/impl/crop_kernel_impl.h @@ -63,7 +63,7 @@ static phi::DDim ValidateShape(const std::vector& shape, } } - return phi::make_ddim(output_shape); + return common::make_ddim(output_shape); } template diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index 744c48b2bfbd61..fdd31e510510a6 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -177,10 +177,10 @@ void DeformableConvGradKernel(const Context& dev_ctx, DenseTensor* mask_grad) { const int batch_size = static_cast(x.dims()[0]); - DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(out_grad.dims())); + DDim input_shape = common::slice_ddim(x.dims(), 1, x.dims().size()); + std::vector input_shape_vec = common::vectorize(input_shape); + std::vector filter_shape_vec(common::vectorize(filter.dims())); + std::vector output_shape_vec(common::vectorize(out_grad.dims())); std::vector col_buffer_shape_vec(filter_shape_vec.size()); col_buffer_shape_vec[0] = x.dims()[1] * filter.dims()[2] * filter.dims()[3]; @@ -195,7 +195,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, DenseTensor col_buffer = Empty(dev_ctx, col_buffer_shape_vec); DenseTensor output_buffer; output_buffer.ShareDataWith(out_grad).Resize( - make_ddim(output_buffer_shape_vec)); + common::make_ddim(output_buffer_shape_vec)); int64_t M = input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3]; @@ -245,14 +245,14 @@ void DeformableConvGradKernel(const Context& dev_ctx, for (int i = 0; i < batch_size / im2col_step; ++i) { DenseTensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size())); + common::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size())); for (int g = 0; g < groups; ++g) { DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); + common::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); DenseTensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size())); + common::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size())); DenseTensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( + col_buffer_3d.Slice(g, g + 1).Resize(common::slice_ddim( col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); blas.MatMul(weight_3d_slice, true, @@ -262,7 +262,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, 
&col_buffer_3d_slice, T(0.0)); } - col_buffer.Resize(make_ddim(col_buffer_shape_vec)); + col_buffer.Resize(common::make_ddim(col_buffer_shape_vec)); T* col_buffer_ptr = col_buffer.data(); const T* input_ptr = x.data(); @@ -329,13 +329,14 @@ void DeformableConvGradKernel(const Context& dev_ctx, DenseTensor dweight_3d = Empty( dev_ctx, {filter_grad_shape.Get(), filter_grad_shape.size()}); for (int g = 0; g < groups; ++g) { - DenseTensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size())); + DenseTensor out_grad_3d_slice = + out_grad_3d.Slice(g, g + 1).Resize(common::slice_ddim( + out_grad_3d.dims(), 1, out_grad_3d.dims().size())); DenseTensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( + col_buffer_3d.Slice(g, g + 1).Resize(common::slice_ddim( col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); DenseTensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size())); + common::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size())); blas.MatMul(out_grad_3d_slice, false, diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index 119c7ad52202bf..d4647128963e5d 100644 --- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h @@ -44,8 +44,8 @@ void DeformableConvKernel(const Context& dev_ctx, im2col_step = temp_step; } - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(out->dims())); + std::vector filter_shape_vec(common::vectorize(filter.dims())); + std::vector output_shape_vec(common::vectorize(out->dims())); // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} std::vector col_buffer_shape_vec(filter_shape_vec.size()); @@ -67,18 +67,18 @@ void DeformableConvKernel(const Context& dev_ctx, int64_t K = x.dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; DenseTensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); + weight_3d.ShareDataWith(filter).Resize(common::make_ddim({groups, M, K})); DenseTensor col_buffer_3d; col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); + .Resize(common::make_ddim({groups, K, N})); DenseTensor output_4d; output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); + .Resize(common::make_ddim({batch_size / im2col_step, groups, M, N})); - DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); + DDim input_shape = common::slice_ddim(x.dims(), 1, x.dims().size()); + std::vector input_shape_vec = common::vectorize(input_shape); int input_dim = x.numel() / x.dims()[0]; int input_offset_dim = offset.numel() / offset.dims()[0]; @@ -107,7 +107,7 @@ void DeformableConvKernel(const Context& dev_ctx, dilations, deformable_groups, col_buffer_ptr); - DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(phi::slice_ddim( + DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(common::slice_ddim( output_4d.dims(), 1, output_4d.dims().size())); // group * C/group * (im2step * H * W) @@ -115,12 +115,12 @@ void DeformableConvKernel(const Context& dev_ctx, // get the product of pixel and weight for (int g = 0; g < groups; ++g) { DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - 
phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); + common::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); DenseTensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( + col_buffer_3d.Slice(g, g + 1).Resize(common::slice_ddim( col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); DenseTensor output_3d_slice = - output_3d.Slice(g, g + 1).Resize(phi::slice_ddim( + output_3d.Slice(g, g + 1).Resize(common::slice_ddim( output_3d.dims(), 1, output_3d.dims().size())); // C * ((im2col_step)*H*W)) @@ -145,16 +145,17 @@ void DeformableConvKernel(const Context& dev_ctx, DenseTensor real_output_buffer = phi::Transpose( dev_ctx, output_4d.Resize( - phi::make_ddim({batch_size / im2col_step, - output_shape_vec[1], - im2col_step, - output_shape_vec[2] * output_shape_vec[3]})), + common::make_ddim({batch_size / im2col_step, + output_shape_vec[1], + im2col_step, + output_shape_vec[2] * output_shape_vec[3]})), axis); out->ShareDataWith(real_output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); + .Resize(common::make_ddim(output_shape_vec)); } else { - out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec)); + out->ShareDataWith(output_buffer) + .Resize(common::make_ddim(output_shape_vec)); } } diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h index b17512ad1da879..8b135c4b520ae8 100644 --- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -110,7 +110,7 @@ void DeterminantGradKernel(const Context& dev_ctx, VLOG(3) << "The input matrix not invertible!"; x_grad->Resize(x.dims()); phi::Full( - dev_ctx, phi::vectorize(x.dims()), static_cast(0.0f), x_grad); + dev_ctx, common::vectorize(x.dims()), static_cast(0.0f), x_grad); return; } diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h index 01c54d780b4b0e..4a308a5798192d 100644 --- a/paddle/phi/kernels/impl/determinant_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h @@ -105,7 +105,7 @@ template void DeterminantKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { - auto input_dim = vectorize(x.dims()); + auto input_dim = common::vectorize(x.dims()); auto input_dim_size = input_dim.size(); auto batch_count = detail::GetBatchCount(x.dims()); @@ -121,12 +121,12 @@ void DeterminantKernel(const Context& dev_ctx, "the input matrix should be square matrix.")); auto rank = input_dim[input_dim_size - 1]; // square matrix length DeterminantFunctor()(dev_ctx, x, rank, batch_count, out); - auto output_dims = phi::slice_ddim(x.dims(), 0, input_dim_size - 2); + auto output_dims = common::slice_ddim(x.dims(), 0, input_dim_size - 2); if (input_dim_size > 2) { out->Resize(output_dims); } else { // when input is a two-dimension matrix, The det value is a number. - out->Resize(phi::make_ddim({})); + out->Resize(common::make_ddim({})); } VLOG(10) << "output dim:" << out->dims(); } diff --git a/paddle/phi/kernels/impl/diag_embed_impl.h b/paddle/phi/kernels/impl/diag_embed_impl.h index a4430fde923434..044deccb3c2c35 100644 --- a/paddle/phi/kernels/impl/diag_embed_impl.h +++ b/paddle/phi/kernels/impl/diag_embed_impl.h @@ -82,7 +82,7 @@ void DiagEmbedKernel(const Context& dev_ctx, auto out_dims = out->dims(); int dim1_ = dim1 < 0 ? out_dims.size() + dim1 : dim1; int dim2_ = dim2 < 0 ? 
out_dims.size() + dim2 : dim2; - auto stride = phi::stride(out_dims); + auto stride = common::stride(out_dims); int64_t diag_size; int64_t storage_offset = 0; if (offset >= 0) { @@ -99,11 +99,11 @@ void DiagEmbedKernel(const Context& dev_ctx, } else { storage_offset -= offset * stride[dim1_]; } - auto strides = vectorize(stride); + auto strides = common::vectorize(stride); strides.erase(strides.begin() + std::max(dim1_, dim2_)); strides.erase(strides.begin() + std::min(dim1_, dim2_)); strides.push_back(stride[dim1_] + stride[dim2_]); - const auto dims = vectorize(x.dims()); + const auto dims = common::vectorize(x.dims()); #if defined(__NVCC__) || defined(__HIPCC__) thrust::device_vector dims_vec(dims); diff --git a/paddle/phi/kernels/impl/dot_grad_kernel_impl.h b/paddle/phi/kernels/impl/dot_grad_kernel_impl.h index add72749d39e1e..3a82ace22860e5 100644 --- a/paddle/phi/kernels/impl/dot_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/dot_grad_kernel_impl.h @@ -104,7 +104,7 @@ struct DotGradFunction> { auto* data_dx = ctx.template Alloc(tensor_dx); const auto* data_y = tensor_y->data(); const DDim& dim = tensor_x->dims(); - size_t N = static_cast(phi::product(dim)); + size_t N = static_cast(common::product(dim)); auto _step = dim.size() > 0 ? dim[dim.size() - 1] : 1; auto step = _step != 0 ? _step : 1; @@ -120,7 +120,7 @@ struct DotGradFunction> { auto* data_dy = ctx.template Alloc(tensor_dy); const auto* data_x = tensor_x->data(); const DDim& dim = tensor_y->dims(); - size_t N = static_cast(phi::product(dim)); + size_t N = static_cast(common::product(dim)); auto _step = dim.size() > 0 ? dim[dim.size() - 1] : 1; auto step = _step != 0 ? _step : 1; diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h index f39786fff2665f..817081a690385a 100644 --- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -48,9 +48,9 @@ void EighGradKernel(const Context& dev_ctx, result.Resize(dims); dev_ctx.template Alloc(&result); - std::vector out_shape = phi::vectorize(dims); + std::vector out_shape = common::vectorize(dims); DenseTensor constant; - constant.Resize(phi::make_ddim(out_shape)); + constant.Resize(common::make_ddim(out_shape)); dev_ctx.template Alloc(&constant); phi::funcs::SetConstant()(dev_ctx, &constant, T(0.5)); result = phi::Subtract( diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index ce33d08c1d82db..9557b1609eeef2 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -61,7 +61,7 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, } } } - t.Resize(make_ddim(resize_dims)); + t.Resize(common::make_ddim(resize_dims)); DenseTensor after_tile; if (std::all_of(repeat_times.begin(), repeat_times.end(), [](int x) { return x == 1; @@ -100,7 +100,7 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, } VLOG(5) << "PermformTileAndReduction: recover shape: " << paddle::string::join_strings(recover_shape, ","); - ret.Resize(make_ddim(recover_shape)); + ret.Resize(common::make_ddim(recover_shape)); // undiagonalize by einsum equation. only contain undiagonal operations. 
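The einsum hunks in this area keep reshaping one buffer: axes are fused through make_ddim for a matmul-style contraction (mul_dims) and the original axes are reinstated afterwards (recover_shape / recover_dim above). A toy sketch of that bookkeeping, with a plain vector standing in for DDim and made-up sizes:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    using Shape = std::vector<int64_t>;  // stand-in for a vectorize()d DDim

    int main() {
      const Shape dims = {2, 3, 4};                         // original shape
      const Shape mul_dims = {dims[0], dims[1] * dims[2]};  // fuse trailing axes for the matmul
      assert(mul_dims == (Shape{2, 12}));                   // same element count, rank 2
      const Shape recover_dim = {mul_dims[0], 3, 4};        // reinstated afterwards
      assert(recover_dim == dims);
      return 0;
    }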
DenseTensor out;
  VLOG(5) << "Undiagonal by einsum with args: " << op_label + "->" + equ;
diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h
index e32f64f347f4c6..0fec027fdf5e17 100644
--- a/paddle/phi/kernels/impl/einsum_impl.h
+++ b/paddle/phi/kernels/impl/einsum_impl.h
@@ -491,7 +491,8 @@ DenseTensor PerformDiagonalAndReduction(const Context& dev_ctx,
    if (cur != label2perm[c]) {
      // do diagonal, followed by movedim().
      VLOG(5) << "Do diagonal with shape="
-              << paddle::string::join_strings(vectorize(res.dims()), ',')
+              << paddle::string::join_strings(
+                     common::vectorize(res.dims()), ',')
              << ", axis1=" << cur << ", axis2=" << label2perm[c];
      res = Diagonal(dev_ctx, res, 0, cur, label2perm[c]);
      res = Transpose(
@@ -623,7 +624,7 @@ DenseTensor PerformContraction(
    }
    VLOG(5) << "PerformContraction: mul_dims: "
            << paddle::string::join_strings(mul_dims, ",");
-    trans_t.Resize(make_ddim(mul_dims));
+    trans_t.Resize(common::make_ddim(mul_dims));
    return trans_t;
  };
@@ -643,7 +644,7 @@ DenseTensor PerformContraction(
  if (recover_dim.size() == 0) recover_dim.push_back(1);
  VLOG(5) << "PerformContraction: recover_dim: "
          << paddle::string::join_strings(recover_dim, ",");
-  after_contraction.Resize(make_ddim(recover_dim));
+  after_contraction.Resize(common::make_ddim(recover_dim));
  return after_contraction;
}
@@ -740,7 +741,7 @@ void EinsumKernelImpl(const Context& dev_ctx,
  broadcast_dims.size());
  *out = PerformUndiagonal(
      dev_ctx, *out, broadcast_dims.size(), right);
-  out->Resize(make_ddim(output_dims));
+  out->Resize(common::make_ddim(output_dims));
}
template
diff --git a/paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h b/paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h
index f0c32dd32e42f3..54ef6e0c1f9cb7 100644
--- a/paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/expand_as_grad_kernel_impl.h
@@ -55,7 +55,7 @@ void ExpandAsGradKernel(const Context& context,
    return;
  }
-  auto vec_in_dims = phi::vectorize(x_dims);
+  auto vec_in_dims = common::vectorize(x_dims);
  auto diff = target_shape.size() - vec_in_dims.size();
  vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
  std::vector repeat_times(vec_in_dims.size());
diff --git a/paddle/phi/kernels/impl/expand_as_kernel_impl.h b/paddle/phi/kernels/impl/expand_as_kernel_impl.h
index 7e3a1a6656140e..cee562b42778e1 100755
--- a/paddle/phi/kernels/impl/expand_as_kernel_impl.h
+++ b/paddle/phi/kernels/impl/expand_as_kernel_impl.h
@@ -30,7 +30,7 @@ void ExpandAs(const Context& context,
  const std::vector& target_shape,
  DenseTensor* out) {
  auto in_dims = x.dims();
-  auto vec_in_dims = phi::vectorize(in_dims);
+  auto vec_in_dims = common::vectorize(in_dims);
  auto diff = target_shape.size() - vec_in_dims.size();
  vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
  std::vector repeat_times(vec_in_dims.size());
@@ -82,8 +82,8 @@ void ExpandAs(const Context& context,
    bcast_dims[i] = repeat_times[i];
  }
-  phi::DDim new_in_dims = phi::make_ddim(vec_in_dims);
-  phi::DDim out_dims = phi::make_ddim(target_shape);
+  phi::DDim new_in_dims = common::make_ddim(vec_in_dims);
+  phi::DDim out_dims = common::make_ddim(target_shape);
  out->Resize(out_dims);
  context.template Alloc(out);
@@ -129,7 +129,7 @@ void ExpandAsKernel(const Context& ctx,
    if (target_shape[i] == -1) {
      if (y) {
        if (y->IsInitialized()) {
-          real_target_shape = phi::vectorize(y->dims());
+          real_target_shape = common::vectorize(y->dims());
        }
      }
      break;
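The expand/expand_as hunks above and below share one rank-alignment step: the input shape is left-padded with 1s up to the target rank before Eigen broadcasting. A standalone sketch of that arithmetic, assuming each target dim divides evenly by the padded input dim as broadcasting requires:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      std::vector<int64_t> vec_in_dims = {3, 1};  // as if common::vectorize(x.dims())
      const std::vector<int64_t> target_shape = {2, 4, 3, 5};
      const size_t diff = target_shape.size() - vec_in_dims.size();
      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);  // now {1, 1, 3, 1}
      std::vector<int64_t> repeat_times(vec_in_dims.size());
      for (size_t i = 0; i < vec_in_dims.size(); ++i) {
        repeat_times[i] = target_shape[i] / vec_in_dims[i];  // broadcast factor per axis
      }
      assert((repeat_times == std::vector<int64_t>{2, 4, 1, 5}));
      return 0;
    }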
diff --git a/paddle/phi/kernels/impl/expand_grad_kernel_impl.h b/paddle/phi/kernels/impl/expand_grad_kernel_impl.h
index 700f64863e4fee..4dd9dc4d50337a 100644
--- a/paddle/phi/kernels/impl/expand_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/expand_grad_kernel_impl.h
@@ -59,7 +59,7 @@ void ExpandGradKernel(const Context& ctx,
    phi::Copy(ctx, out_grad, ctx.GetPlace(), false, in_grad);
    return;
  }
-  auto vec_in_dims = phi::vectorize(x_dims);
+  auto vec_in_dims = common::vectorize(x_dims);
  auto diff = expand_shape.size() - vec_in_dims.size();
  vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
  // 1. reshape_dims_vec is the broadcast parameter.
diff --git a/paddle/phi/kernels/impl/expand_kernel_impl.h b/paddle/phi/kernels/impl/expand_kernel_impl.h
index 4738088781de9d..181dd2558fa385 100644
--- a/paddle/phi/kernels/impl/expand_kernel_impl.h
+++ b/paddle/phi/kernels/impl/expand_kernel_impl.h
@@ -31,7 +31,7 @@ void Expand(const Context& ctx,
  DenseTensor* out) {
  auto in_dims = x.dims();
  auto expand_shape = shape.GetData();
-  auto vec_in_dims = phi::vectorize(in_dims);
+  auto vec_in_dims = common::vectorize(in_dims);
  auto diff = expand_shape.size() - vec_in_dims.size();
  vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
  std::vector repeat_times(vec_in_dims.size());
@@ -83,7 +83,7 @@ void Expand(const Context& ctx,
    bcast_dims[i] = repeat_times[i];
  }
-  DDim new_in_dims = phi::make_ddim(vec_in_dims);
+  DDim new_in_dims = common::make_ddim(vec_in_dims);
  DDim out_dims(new_in_dims);
  for (size_t i = 0; i < repeat_times.size(); ++i) {
    out_dims[i] *= repeat_times[i];
diff --git a/paddle/phi/kernels/impl/fc_kernel_impl.h b/paddle/phi/kernels/impl/fc_kernel_impl.h
index 061f1baad3108b..c30da9d4e50009 100644
--- a/paddle/phi/kernels/impl/fc_kernel_impl.h
+++ b/paddle/phi/kernels/impl/fc_kernel_impl.h
@@ -46,13 +46,13 @@ void FCKernel(const Context& dev_ctx,
  std::vector output_dims;
  phi::funcs::FCOutputSize(
      input.dims(), w_dims, output_dims, in_num_col_dims, padding_weights);
-  out->Resize(phi::make_ddim(output_dims));
+  out->Resize(common::make_ddim(output_dims));
  out->set_lod(input.lod());
  auto out_dims = out->dims();
  auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0];
  auto w_dims1 = padding_weights ?
w_dims[1] - 4 : w_dims[1]; - int M = phi::product(out_dims) / w_dims1; + int M = common::product(out_dims) / w_dims1; const T* input_data = input.data(); const T* w_data = w.data(); diff --git a/paddle/phi/kernels/impl/fft_grad_kernel_impl.h b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h index de4bb8d4bd1734..72c8bc659a632a 100644 --- a/paddle/phi/kernels/impl/fft_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h @@ -18,8 +18,8 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/complex_kernel.h" @@ -92,10 +92,10 @@ void FFTC2RGradKernel(const Context& ctx, const int64_t double_length = out_grad.dims()[axes.back()] - x_grad->dims()[axes.back()]; - const phi::DDim strides = phi::stride(x_grad->dims()); + const phi::DDim strides = common::stride(x_grad->dims()); #if defined(__NVCC__) || defined(__HIPCC__) - const thrust::device_vector strides_g(phi::vectorize(strides)); + const thrust::device_vector strides_g(common::vectorize(strides)); const int64_t* pstrides = thrust::raw_pointer_cast(strides_g.data()); #else const int64_t* pstrides = strides.Get(); diff --git a/paddle/phi/kernels/impl/fft_kernel_impl.h b/paddle/phi/kernels/impl/fft_kernel_impl.h index 13c54182d1d316..eab6c5f5a111bb 100644 --- a/paddle/phi/kernels/impl/fft_kernel_impl.h +++ b/paddle/phi/kernels/impl/fft_kernel_impl.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/fft.h" @@ -75,7 +75,7 @@ void FFTR2CKernel(const Context& ctx, out->dims().at(last_fft_axis) / 2 + 1; onesided_out_shape[last_fft_axis] = onesided_last_axis_size; DenseTensor onesided_out = - Empty(ctx, phi::vectorize(onesided_out_shape)); + Empty(ctx, common::vectorize(onesided_out_shape)); fft_r2c_func(ctx, x, &onesided_out, axes, norm_type, forward); funcs::FFTFillConj(ctx, &onesided_out, out, axes); } diff --git a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h index 1cfbb496d7750c..067ca010b31a90 100644 --- a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h @@ -53,8 +53,8 @@ void FoldGradKernel(const Context& ctx, int n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); DDim out_shape = - make_ddim({n_output_plane, output_sizes[0], output_sizes[1]}); - DDim input_matrix_shape = make_ddim( + common::make_ddim({n_output_plane, output_sizes[0], output_sizes[1]}); + DDim input_matrix_shape = common::make_ddim( {1, kernel_sizes[0], kernel_sizes[1], output_height, output_width}); phi::funcs::Im2ColFunctor im2col; diff --git a/paddle/phi/kernels/impl/fold_kernel_impl.h b/paddle/phi/kernels/impl/fold_kernel_impl.h index 694d754ecfb8e4..dfe11b0759aad5 100644 --- a/paddle/phi/kernels/impl/fold_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_kernel_impl.h @@ -52,9 +52,9 @@ void FoldKernel(const Context& ctx, int n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); DDim output_shape = - make_ddim({n_output_plane, output_sizes[0], output_sizes[1]}); + common::make_ddim({n_output_plane, output_sizes[0], output_sizes[1]}); - DDim input_matrix_shape = make_ddim( + DDim input_matrix_shape = common::make_ddim( {1, kernel_sizes[0], kernel_sizes[1], 
       {1, kernel_sizes[0], kernel_sizes[1], output_height, output_width});
   phi::funcs::SetConstant set_zero;
diff --git a/paddle/phi/kernels/impl/frame_grad_kernel_impl.h b/paddle/phi/kernels/impl/frame_grad_kernel_impl.h
index 9f6ceee24f183a..37f5de45cca5cd 100644
--- a/paddle/phi/kernels/impl/frame_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/frame_grad_kernel_impl.h
@@ -41,15 +41,15 @@ void FrameGradKernel(const Context& dev_ctx,
   DDim dx_resized_dims;
   DDim dout_resized_dims;
   if (axis == 0) {
-    preserved_dims = phi::slice_ddim(dx->dims(), 1, dx_rank);
-    dx_resized_dims = {seq_length, phi::product(preserved_dims)};
+    preserved_dims = common::slice_ddim(dx->dims(), 1, dx_rank);
+    dx_resized_dims = {seq_length, common::product(preserved_dims)};
     dout_resized_dims = {
-        n_frames, frame_length, phi::product(preserved_dims)};
+        n_frames, frame_length, common::product(preserved_dims)};
   } else {
-    preserved_dims = phi::slice_ddim(dx->dims(), 0, dx_rank - 1);
-    dx_resized_dims = {phi::product(preserved_dims), seq_length};
+    preserved_dims = common::slice_ddim(dx->dims(), 0, dx_rank - 1);
+    dx_resized_dims = {common::product(preserved_dims), seq_length};
     dout_resized_dims = {
-        phi::product(preserved_dims), frame_length, n_frames};
+        common::product(preserved_dims), frame_length, n_frames};
   }
   dx->Resize(dx_resized_dims);
   dout_tmp.Resize(dout_resized_dims);
@@ -64,31 +64,31 @@ void FrameGradKernel(const Context& dev_ctx,
     trans_dx = *dx;
     std::vector perm_dout{1, 0};
-    auto dout_dims_vec = phi::vectorize(dout_tmp.dims());
+    auto dout_dims_vec = common::vectorize(dout_tmp.dims());
     for (int i = 0; i < dout_tmp.dims().size(); ++i) {
       dout_dims_vec[i] = dout_tmp.dims()[perm_dout[i]];
     }
-    trans_dout.Resize(phi::make_ddim(dout_dims_vec));
+    trans_dout.Resize(common::make_ddim(dout_dims_vec));
     dev_ctx.template Alloc(&trans_dout);
     phi::funcs::TransCompute(
         perm_dout.size(), dev_ctx, dout_tmp, &trans_dout, perm_dout);
   } else {
     std::vector perm_dx{1, 0};
-    auto dx_dims_vec = phi::vectorize(dx->dims());
+    auto dx_dims_vec = common::vectorize(dx->dims());
     for (int i = 0; i < dx->dims().size(); ++i) {
       dx_dims_vec[i] = dx->dims()[perm_dx[i]];
     }
-    trans_dx.Resize(phi::make_ddim(dx_dims_vec));
+    trans_dx.Resize(common::make_ddim(dx_dims_vec));
     dev_ctx.template Alloc(&trans_dx);
     phi::funcs::TransCompute(
         perm_dx.size(), dev_ctx, *dx, &trans_dx, perm_dx);
     std::vector perm_dout{2, 1, 0};
-    auto dout_dims_vec = phi::vectorize(dout_tmp.dims());
+    auto dout_dims_vec = common::vectorize(dout_tmp.dims());
     for (int i = 0; i < dout_tmp.dims().size(); ++i) {
       dout_dims_vec[i] = dout_tmp.dims()[perm_dout[i]];
     }
-    trans_dout.Resize(phi::make_ddim(dout_dims_vec));
+    trans_dout.Resize(common::make_ddim(dout_dims_vec));
     dev_ctx.template Alloc(&trans_dout);
     phi::funcs::TransCompute(
         perm_dout.size(), dev_ctx, dout_tmp, &trans_dout, perm_dout);
@@ -129,7 +129,7 @@ void FrameGradKernel(const Context& dev_ctx,
       restored_dx_shape.push_back(seq_length);
     }
-    dx->Resize(phi::make_ddim(restored_dx_shape));
+    dx->Resize(common::make_ddim(restored_dx_shape));
   }
 }
 }  // namespace phi
diff --git a/paddle/phi/kernels/impl/frame_kernel_impl.h b/paddle/phi/kernels/impl/frame_kernel_impl.h
index b6a0b2ab6a3e48..fa0c5658efe550 100644
--- a/paddle/phi/kernels/impl/frame_kernel_impl.h
+++ b/paddle/phi/kernels/impl/frame_kernel_impl.h
@@ -42,13 +42,15 @@ void FrameKernel(const Context& dev_ctx,
   DDim x_resized_dims;
   DDim out_resized_dims;
   if (axis == 0) {
-    preserved_dims = phi::slice_ddim(x_tmp.dims(), 1, x_rank);
-    x_resized_dims = {seq_length, phi::product(preserved_dims)};
-    out_resized_dims = {n_frames, frame_length, phi::product(preserved_dims)};
+    preserved_dims = common::slice_ddim(x_tmp.dims(), 1, x_rank);
+    x_resized_dims = {seq_length, common::product(preserved_dims)};
+    out_resized_dims = {
+        n_frames, frame_length, common::product(preserved_dims)};
   } else {
-    preserved_dims = phi::slice_ddim(x_tmp.dims(), 0, x_rank - 1);
-    x_resized_dims = {phi::product(preserved_dims), seq_length};
-    out_resized_dims = {phi::product(preserved_dims), frame_length, n_frames};
+    preserved_dims = common::slice_ddim(x_tmp.dims(), 0, x_rank - 1);
+    x_resized_dims = {common::product(preserved_dims), seq_length};
+    out_resized_dims = {
+        common::product(preserved_dims), frame_length, n_frames};
   }
   x_tmp.Resize(x_resized_dims);
   out->Resize(out_resized_dims);
@@ -63,32 +65,32 @@ void FrameKernel(const Context& dev_ctx,
     trans_x = x_tmp;
     std::vector perm_out{1, 0};
-    auto out_dims_vec = phi::vectorize(out->dims());
+    auto out_dims_vec = common::vectorize(out->dims());
     for (int i = 0; i < out->dims().size(); ++i) {
       out_dims_vec[i] = out->dims()[perm_out[i]];
     }
-    trans_out.Resize(phi::make_ddim(out_dims_vec));
+    trans_out.Resize(common::make_ddim(out_dims_vec));
     dev_ctx.template Alloc(&trans_out);
     phi::funcs::TransCompute(
         perm_out.size(), dev_ctx, *out, &trans_out, perm_out);
   } else {
     std::vector perm_x{1, 0};
-    auto x_dims_vec = phi::vectorize(x_tmp.dims());
+    auto x_dims_vec = common::vectorize(x_tmp.dims());
     for (int i = 0; i < x_tmp.dims().size(); ++i) {
       x_dims_vec[i] = x_tmp.dims()[perm_x[i]];
     }
-    trans_x.Resize(phi::make_ddim(x_dims_vec));
+    trans_x.Resize(common::make_ddim(x_dims_vec));
     dev_ctx.template Alloc(&trans_x);
     phi::funcs::TransCompute(
         perm_x.size(), dev_ctx, x_tmp, &trans_x, perm_x);
     std::vector perm_out{2, 1, 0};
-    auto out_dims_vec = phi::vectorize(out->dims());
+    auto out_dims_vec = common::vectorize(out->dims());
     for (int i = 0; i < out->dims().size(); ++i) {
       out_dims_vec[i] = out->dims()[perm_out[i]];
     }
-    trans_out.Resize(phi::make_ddim(out_dims_vec));
+    trans_out.Resize(common::make_ddim(out_dims_vec));
     dev_ctx.template Alloc(&trans_out);
     phi::funcs::TransCompute(
         perm_out.size(), dev_ctx, *out, &trans_out, perm_out);
@@ -137,7 +139,7 @@ void FrameKernel(const Context& dev_ctx,
       restored_out_shape.push_back(n_frames);
     }
-    out->Resize(phi::make_ddim(restored_out_shape));
+    out->Resize(common::make_ddim(restored_out_shape));
   }
 }
diff --git a/paddle/phi/kernels/impl/full_whit_tensor_kernel_impl.h b/paddle/phi/kernels/impl/full_whit_tensor_kernel_impl.h
index a78af4f98c2b5c..ae7ce8a3f41a86 100644
--- a/paddle/phi/kernels/impl/full_whit_tensor_kernel_impl.h
+++ b/paddle/phi/kernels/impl/full_whit_tensor_kernel_impl.h
@@ -25,7 +25,7 @@ void FullWithTensorKernel(const Context& dev_ctx,
                           DataType dtype,
                           DenseTensor* out) {
   auto shape_tmp = IntArray(shape);
-  out->Resize(phi::make_ddim(shape_tmp.GetData()));
+  out->Resize(common::make_ddim(shape_tmp.GetData()));
   FullKernel(dev_ctx, shape_tmp, Scalar(value), dtype, out);
 }
 }  // namespace phi
diff --git a/paddle/phi/kernels/impl/graph_message_passing_impl.h b/paddle/phi/kernels/impl/graph_message_passing_impl.h
index dc1477e77227b9..448836c0f84052 100644
--- a/paddle/phi/kernels/impl/graph_message_passing_impl.h
+++ b/paddle/phi/kernels/impl/graph_message_passing_impl.h
@@ -90,8 +90,8 @@ inline BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims,
 inline std::vector InferBroadcastShape(const phi::DDim& x_dims,
                                        const phi::DDim& e_dims,
                                        const std::string& type = "x") {
-  auto x_dims1 = phi::vectorize(x_dims);
-  auto e_dims1 = phi::vectorize(e_dims);
+  auto x_dims1 = common::vectorize(x_dims);
+  auto e_dims1 = common::vectorize(e_dims);
   std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end());
   std::vector e_dims2(e_dims1.begin() + 1, e_dims1.end());
   int max_dim = std::max(x_dims2.size(), e_dims2.size());
@@ -100,8 +100,8 @@ inline std::vector InferBroadcastShape(const phi::DDim& x_dims,
   std::vector e_dims_array(max_dim);
   std::vector out_dims_array(max_dim);
   // Only need to broadcast dimensions other than the 0th dimension.
-  phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2),
-                                     phi::make_ddim(e_dims2),
+  phi::funcs::GetBroadcastDimsArrays(common::make_ddim(x_dims2),
+                                     common::make_ddim(e_dims2),
                                      x_dims_array.data(),
                                      e_dims_array.data(),
                                      out_dims_array.data(),
@@ -117,7 +117,7 @@ inline std::vector InferBroadcastShape(const phi::DDim& x_dims,
 inline bool ReduceGrad(const phi::DDim& out_grad_dims,
                        const phi::DDim& x_dims,
-                       std::vector& axis) {
+                       std::vector& axis) {  // NOLINT
   // We must ensure the ndim of out_grad and x are the same.
   bool reduce = false;
   for (int i = 1; i < out_grad_dims.size(); i++) {
diff --git a/paddle/phi/kernels/impl/kron_grad_kernel_impl.h b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h
index 352e4d30067197..3b195d6fa8b0ad 100644
--- a/paddle/phi/kernels/impl/kron_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h
@@ -168,11 +168,11 @@ struct KronGradOpFunctor {
     const phi::DDim &dim_y = y.dims();
     const phi::DDim &dim_dout = dout.dims();
     const phi::DDim stride_x =
-        dim_x.size() == 0 ? phi::DDim(dim_x) : phi::stride(dim_x);
+        dim_x.size() == 0 ? phi::DDim(dim_x) : common::stride(dim_x);
     const phi::DDim stride_y =
-        dim_y.size() == 0 ? phi::DDim(dim_y) : phi::stride(dim_y);
+        dim_y.size() == 0 ? phi::DDim(dim_y) : common::stride(dim_y);
     const phi::DDim stride_dout =
-        dim_dout.size() == 0 ? phi::DDim(dim_dout) : phi::stride(dim_dout);
+        dim_dout.size() == 0 ? phi::DDim(dim_dout) : common::stride(dim_dout);
     const int64_t *p_stride_x = nullptr;
     const int64_t *p_stride_y = nullptr;
diff --git a/paddle/phi/kernels/impl/kron_kernel_impl.h b/paddle/phi/kernels/impl/kron_kernel_impl.h
index e1fcb49949a748..e90c45c01879fc 100644
--- a/paddle/phi/kernels/impl/kron_kernel_impl.h
+++ b/paddle/phi/kernels/impl/kron_kernel_impl.h
@@ -45,7 +45,7 @@ inline DenseTensor UnsqueezeTo(const DenseTensor &src, int ndims) {
     for (int i = ndims - rank; i < ndims; i++) {
       new_dim[i] = shape[i - ndims + rank];
     }
-    res.Resize(phi::make_ddim(new_dim));
+    res.Resize(common::make_ddim(new_dim));
   }
   return res;
 }
@@ -109,11 +109,11 @@ struct KronOpFunctor {
     const phi::DDim &dim_y = y.dims();
     const phi::DDim &dim_out = out->dims();
     const phi::DDim stride_x =
-        dim_x.size() == 0 ? phi::DDim(dim_x) : phi::stride(dim_x);
+        dim_x.size() == 0 ? phi::DDim(dim_x) : common::stride(dim_x);
     const phi::DDim stride_y =
-        dim_y.size() == 0 ? phi::DDim(dim_y) : phi::stride(dim_y);
+        dim_y.size() == 0 ? phi::DDim(dim_y) : common::stride(dim_y);
     const phi::DDim stride_out =
-        dim_out.size() == 0 ? phi::DDim(dim_out) : phi::stride(dim_out);
+        dim_out.size() == 0 ? phi::DDim(dim_out) : common::stride(dim_out);
     const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr,
                   *p_stride_out = nullptr, *p_shape_y = nullptr;
diff --git a/paddle/phi/kernels/impl/lamb_kernel_impl.h b/paddle/phi/kernels/impl/lamb_kernel_impl.h
index 5b1eb43129f203..91f73402411ec3 100644
--- a/paddle/phi/kernels/impl/lamb_kernel_impl.h
+++ b/paddle/phi/kernels/impl/lamb_kernel_impl.h
@@ -249,11 +249,11 @@ void ComputeImpl(const Context& dev_ctx,
   auto* trust_ratio_div_norm_ptr = trust_ratio_div_norm_t.data();
   // DenseTensor p_norm_t;
-  // p_norm_t.Resize(phi::make_ddim({1}));
+  // p_norm_t.Resize(common::make_ddim({1}));
   // auto* p_norm_ptr = dev_ctx.template Alloc(&p_norm_t);
   // DenseTensor trust_ratio_div_norm_t;
-  // trust_ratio_div_norm_t.Resize(phi::make_ddim({1}));
+  // trust_ratio_div_norm_t.Resize(common::make_ddim({1}));
   // auto* trust_ratio_div_norm_ptr =
   //     dev_ctx.template Alloc(&trust_ratio_div_norm_t);
diff --git a/paddle/phi/kernels/impl/lerp_grad_kernel_impl.h b/paddle/phi/kernels/impl/lerp_grad_kernel_impl.h
index 54a6172501aeae..316d00f07a35c6 100644
--- a/paddle/phi/kernels/impl/lerp_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/lerp_grad_kernel_impl.h
@@ -105,7 +105,7 @@ static void LerpGradFunctionZero(const Context& ctx,
                                  const DenseTensor& out_grad,
                                  DenseTensor* x_grad,
                                  DenseTensor* y_grad) {
-  auto dim = make_ddim(std::vector(1, 1));
+  auto dim = common::make_ddim(std::vector(1, 1));
   auto eigen_w = phi::EigenTensor::From(weight, dim);
   auto eigen_dout = phi::EigenTensor::From(out_grad, dim);
diff --git a/paddle/phi/kernels/impl/lerp_kernel_impl.h b/paddle/phi/kernels/impl/lerp_kernel_impl.h
index 9509d3300e5bdf..0c17f0e61ab30a 100644
--- a/paddle/phi/kernels/impl/lerp_kernel_impl.h
+++ b/paddle/phi/kernels/impl/lerp_kernel_impl.h
@@ -62,7 +62,7 @@ static void LerpFunctionZero(const Context& ctx,
                              DenseTensor* out) {
   ctx.template Alloc(out);
-  auto dim = make_ddim(std::vector(1, 1));
+  auto dim = common::make_ddim(std::vector(1, 1));
   auto eigen_x = phi::EigenTensor::From(x, dim);
   auto eigen_y = phi::EigenTensor::From(y, dim);
   auto eigen_w = phi::EigenTensor::From(weight, dim);
diff --git a/paddle/phi/kernels/impl/lstsq_kernel_impl.h b/paddle/phi/kernels/impl/lstsq_kernel_impl.h
index 2f26391bc6be3f..0aafee5788fa91 100644
--- a/paddle/phi/kernels/impl/lstsq_kernel_impl.h
+++ b/paddle/phi/kernels/impl/lstsq_kernel_impl.h
@@ -122,7 +122,7 @@ inline void BatchedOrmqr(const GPUContext& dev_ctx,
   PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSormqr_bufferSize(
       handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork));
   DenseTensor* info = new DenseTensor();
-  info->Resize(make_ddim({1}));
+  info->Resize(common::make_ddim({1}));
   int* info_d = dev_ctx.template Alloc(info);
   for (int i = 0; i < batch_size; ++i) {
@@ -132,7 +132,7 @@ inline void BatchedOrmqr(const GPUContext& dev_ctx,
     handle = dev_ctx.cusolver_dn_handle();
     DenseTensor* workspace = new DenseTensor();
-    workspace->Resize(make_ddim({lwork}));
+    workspace->Resize(common::make_ddim({lwork}));
     float* workspace_ptr = dev_ctx.template Alloc(workspace);
     // compute ormgr
@@ -191,7 +191,7 @@ inline void BatchedOrmqr(const GPUContext& dev_ctx,
   PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDormqr_bufferSize(
       handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork));
   DenseTensor* info = new DenseTensor();
-  info->Resize(make_ddim({1}));
+  info->Resize(common::make_ddim({1}));
   int* info_d = dev_ctx.template Alloc(info);
   for (int i = 0; i < batch_size; ++i) {
@@ -201,7 +201,7 @@ void BatchedOrmqr(const GPUContext& dev_ctx,
     handle = dev_ctx.cusolver_dn_handle();
     DenseTensor* workspace = new DenseTensor();
-    workspace->Resize(make_ddim({lwork}));
+    workspace->Resize(common::make_ddim({lwork}));
     double* workspace_ptr = dev_ctx.template Alloc(workspace);
     // compute ormgr
diff --git a/paddle/phi/kernels/impl/lu_grad_kernel_impl.h b/paddle/phi/kernels/impl/lu_grad_kernel_impl.h
index 8f3a37d25b2fb9..71747addfcdbd2 100644
--- a/paddle/phi/kernels/impl/lu_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/lu_grad_kernel_impl.h
@@ -109,7 +109,7 @@ void LUGradKernel(const Context& dev_ctx,
   std::vector axes = {xrank - 2, xrank - 1};
   std::vector slice_starts(2, 0);
   std::vector slice_ends(2, 0);
-  auto valuedims = vectorize(xdims);
+  auto valuedims = common::vectorize(xdims);
   DenseTensor Pmat;
   Unpack_Pivot(dev_ctx, pivots, &Pmat, m, k);
diff --git a/paddle/phi/kernels/impl/lu_kernel_impl.h b/paddle/phi/kernels/impl/lu_kernel_impl.h
index d2838551ff20a7..feca05cf734100 100644
--- a/paddle/phi/kernels/impl/lu_kernel_impl.h
+++ b/paddle/phi/kernels/impl/lu_kernel_impl.h
@@ -79,7 +79,7 @@ void SetValueCompute(const Context& dev_ctx,
       none_axes_cur++;
     }
-    slice_dims_for_assign = phi::make_ddim(slice_dims_with_none);
+    slice_dims_for_assign = common::make_ddim(slice_dims_with_none);
   }
   auto place = dev_ctx.GetPlace();
@@ -158,7 +158,7 @@ void SetValueCompute(const Context& dev_ctx,
         dev_ctx, slice_tensor, *value_tensor, SubFunctor(), &slice_tensor);
   } else {
     DenseTensor value_t(dtype);
-    auto value_dims = phi::make_ddim(shape);
+    auto value_dims = common::make_ddim(shape);
     CheckIsDimsMatch(slice_dims_for_assign, value_dims);
     value_t.Resize(value_dims);
@@ -389,7 +389,7 @@ void arange(const Context& dev_ctx,
             int w,
             int batchsize = 1,
             int h = 1) {
-  tmp->Resize(phi::make_ddim({batchsize * w}));
+  tmp->Resize(common::make_ddim({batchsize * w}));
   dev_ctx.template HostAlloc(tmp);
   auto tmpdata = tmp->data();
   for (int b = 0; b < batchsize; b++) {
@@ -439,7 +439,7 @@ void LU_Unpack(const Context& dev_ctx,
   // set L's diagonal 1
   auto dim = std::min(H, W);
   DenseTensor rowtensor, rt_dev;
-  auto batchsize = product(phi::slice_ddim(udims, 0, udims.size() - 2));
+  auto batchsize = product(common::slice_ddim(udims, 0, udims.size() - 2));
   // if udims is [0, ..., H, W], it should be 0
   if (udims.size() == 2) batchsize = std::max(static_cast(batchsize), 1);
@@ -477,7 +477,7 @@ void Unpack_Pivot(const Context& dev_ctx,
                   int h,
                   int w UNUSED) {
   auto dims = Pivot.dims();
-  auto Pdimvec = vectorize(dims);
+  auto Pdimvec = common::vectorize(dims);
   auto prank = Pdimvec.size();
   auto Pnum = dims[prank - 1];
   DenseTensor Pivot_cpu;
@@ -486,14 +486,14 @@ void Unpack_Pivot(const Context& dev_ctx,
   auto pdataptr = Pivot_cpu.data();
   Pdimvec[prank - 1] = h;
   Pdimvec.emplace_back(h);
-  auto Pdim = phi::make_ddim(Pdimvec);
+  auto Pdim = common::make_ddim(Pdimvec);
   P->Resize(Pdim);
   dev_ctx.template Alloc(P);
   auto pdata = P->data();
   phi::funcs::SetConstant setter;
   setter(dev_ctx, P, static_cast(0));
-  auto batchsize = product(phi::slice_ddim(dims, 0, prank - 1));
+  auto batchsize = product(common::slice_ddim(dims, 0, prank - 1));
   if (prank == 1) batchsize = std::max(static_cast(batchsize), 1);
   DenseTensor idt;
@@ -525,7 +525,7 @@ DenseTensor Transpose2DTo6D(const Context& dev_ctx, const DenseTensor& x) {
   // transpose the last two dimensions
   DenseTensor ret;
   auto x_dim = x.dims();
-  auto x_vec = phi::vectorize(x_dim);
+  auto x_vec = common::vectorize(x_dim);
   int rank = x_vec.size();
   for (int i = 0; i < x_dim.size(); i++) {
@@ -542,7 +542,7 @@ DenseTensor Transpose2DTo6D(const Context& dev_ctx, const DenseTensor& x) {
     axis[i] = i;
   }
   std::swap(axis[rank - 1], axis[rank - 2]);
-  ret.Resize(phi::make_ddim(x_vec));
+  ret.Resize(common::make_ddim(x_vec));
   dev_ctx.template Alloc(&ret);
   switch (rank) {
     case 2: {
diff --git a/paddle/phi/kernels/impl/lu_unpack_grad_kernel_impl.h b/paddle/phi/kernels/impl/lu_unpack_grad_kernel_impl.h
index 7098b745e6d255..f1d904663a7233 100644
--- a/paddle/phi/kernels/impl/lu_unpack_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/lu_unpack_grad_kernel_impl.h
@@ -64,7 +64,7 @@ void LUUnpackGradKernel(const Context& dev_ctx,
   std::vector axes = {xrank - 2, xrank - 1};
   std::vector slice_starts(2, 0);
   std::vector slice_ends(2, 0);
-  auto valuedims = vectorize(xdims);
+  auto valuedims = common::vectorize(xdims);
   phi::funcs::SetConstant setter;
   setter(dev_ctx, x_grad, static_cast(0));
diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
index 4125e49db6eef6..40ff69c50f1d7f 100644
--- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
@@ -134,7 +134,7 @@ static DDim RowMatrixFromVector(const DDim& x_dim) {
   if (x_dim.size() > 1) {
     return x_dim;
   }
-  return phi::make_ddim({1, x_dim[0]});
+  return common::make_ddim({1, x_dim[0]});
 }
 /**
@@ -145,7 +145,7 @@ static DDim ColumnMatrixFromVector(const DDim& y_dim) {
   if (y_dim.size() > 1) {
     return y_dim;
   }
-  return phi::make_ddim({y_dim[0], 1});
+  return common::make_ddim({y_dim[0], 1});
 }
 /**
@@ -229,9 +229,9 @@ void MatmulGradKernel(const Context& dev_ctx,
                       DenseTensor* dx,
                       DenseTensor* dy) {
   // get dims
-  std::vector x_dims = vectorize(x.dims());
-  std::vector y_dims = vectorize(y.dims());
-  std::vector dout_dims = vectorize(out_grad.dims());
+  std::vector x_dims = common::vectorize(x.dims());
+  std::vector y_dims = common::vectorize(y.dims());
+  std::vector dout_dims = common::vectorize(out_grad.dims());
   int x_ndim = x_dims.size();
   int y_ndim = y_dims.size();
@@ -422,8 +422,10 @@ void MatmulGradKernel(const Context& dev_ctx,
     }
     // get help dims
-    const std::vector dx_help_dims = vectorize(dx_help.dims());
-    const std::vector dy_help_dims = vectorize(dy_help.dims());
+    const std::vector dx_help_dims =
+        common::vectorize(dx_help.dims());
+    const std::vector dy_help_dims =
+        common::vectorize(dy_help.dims());
     std::vector dx_broadcast_dims(ndim);
     std::vector dy_broadcast_dims(ndim);
@@ -485,9 +487,9 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
                             DenseTensor* dy,
                             DenseTensor* ddout) {
   // Get dims from the input x, y, output_grad
-  std::vector x_dims = vectorize(x.dims());
-  std::vector y_dims = vectorize(y.dims());
-  std::vector dout_dims = vectorize(dout.dims());
+  std::vector x_dims = common::vectorize(x.dims());
+  std::vector y_dims = common::vectorize(y.dims());
+  std::vector dout_dims = common::vectorize(dout.dims());
   int x_ndim = x_dims.size();
   int y_ndim = y_dims.size();
@@ -791,8 +793,10 @@ void MatmulDoubleGradKernel(const Context& dev_ctx,
     }
     // get help dims
-    const std::vector dx_help_dims = vectorize(dx_help.dims());
-    const std::vector dy_help_dims = vectorize(dy_help.dims());
+    const std::vector dx_help_dims =
+        common::vectorize(dx_help.dims());
+    const std::vector dy_help_dims =
+        common::vectorize(dy_help.dims());
     std::vector dx_broadcast_dims(ndim);
     std::vector dy_broadcast_dims(ndim);
@@ -888,9 +892,9 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
                             DenseTensor* out_d_ddx,
                             DenseTensor* out_d_ddy) {
   // Get dims from the input x, y, output_grad
-  std::vector x_dims = vectorize(x.dims());
-  std::vector y_dims = vectorize(y.dims());
-  std::vector dout_dims = vectorize(dout.dims());
+  std::vector x_dims = common::vectorize(x.dims());
+  std::vector y_dims = common::vectorize(y.dims());
+  std::vector dout_dims = common::vectorize(dout.dims());
   int x_ndim = x_dims.size();
   int y_ndim = y_dims.size();
@@ -1539,9 +1543,9 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
     // get help dims
     const std::vector dx_help_dims =
-        vectorize(out_dx_help.dims());
+        common::vectorize(out_dx_help.dims());
     const std::vector dy_help_dims =
-        vectorize(out_dx_help.dims());
+        common::vectorize(out_dx_help.dims());
     std::vector dx_broadcast_dims(ndim);
     std::vector dy_broadcast_dims(ndim);
@@ -1883,8 +1887,8 @@ void MatmulWithFlattenGradKernel(const Context& dev_ctx,
   auto* dout = &out_grad;
   DenseTensor dout_mat(*dout);
-  dout_mat.Resize({phi::flatten_to_2d(x.dims(), x_num_col_dims)[0],
-                   phi::flatten_to_2d(y.dims(), y_num_col_dims)[1]});
+  dout_mat.Resize({common::flatten_to_2d(x.dims(), x_num_col_dims)[0],
+                   common::flatten_to_2d(y.dims(), y_num_col_dims)[1]});
   auto* dx = x_grad;
   auto* dy = y_grad;
@@ -1932,8 +1936,8 @@ void MatmulWithFlattenDoubleGradKernel(
   auto y_mat = y.dims().size() > 2 ? phi::ReshapeToMatrix(y, y_num_col_dims) : y;
-  const int m = phi::flatten_to_2d(x.dims(), x_num_col_dims)[0];
-  const int n = phi::flatten_to_2d(y.dims(), y_num_col_dims)[1];
+  const int m = common::flatten_to_2d(x.dims(), x_num_col_dims)[0];
+  const int n = common::flatten_to_2d(y.dims(), y_num_col_dims)[1];
   auto* dout = &out_grad;
   DenseTensor dout_mat(*dout);
diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h
index 373453d1eefa45..85826728f404c4 100644
--- a/paddle/phi/kernels/impl/matmul_kernel_impl.h
+++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h
@@ -131,7 +131,7 @@ void MatMulFunctionImplWithBlas(
                         M,
                         N));
     VLOG(3) << "MatMul's case 1";
-    Out->Resize(phi::make_ddim({}));
+    Out->Resize(common::make_ddim({}));
     dev_ctx.template Alloc(Out);
     blas.GEMM(CblasNoTrans,
               CblasTrans,
@@ -178,7 +178,7 @@ void MatMulFunctionImplWithBlas(
       std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin());
       out_dims.back() = y_dims.back();
     }
-    Out->ResizeAndAllocate(phi::make_ddim(out_dims));
+    Out->ResizeAndAllocate(common::make_ddim(out_dims));
     dev_ctx.template Alloc(Out);
     if (trans_y) {
       const int M = Y.numel() / N;
@@ -256,7 +256,7 @@ void MatMulFunctionImplWithBlas(
     } else {
       std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin());
     }
-    Out->ResizeAndAllocate(phi::make_ddim(out_dims));
+    Out->ResizeAndAllocate(common::make_ddim(out_dims));
     dev_ctx.template Alloc(Out);
     if (trans_x) {
@@ -344,7 +344,7 @@ void MatMulFunctionImplWithBlas(
   out_broadcast_dims[ndim - 2] = M;
   out_broadcast_dims[ndim - 1] = N;
-  Out->ResizeAndAllocate(phi::make_ddim(out_broadcast_dims));
+  Out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims));
   dev_ctx.template Alloc(Out);
   const int batch_dim = ndim - 2;
@@ -521,7 +521,7 @@ void MatMulFunctionImplWithCublasLt(
                         N));
     // MatMul's case 0 => vector * vector
-    Out->Resize(phi::make_ddim({}));
+    Out->Resize(common::make_ddim({}));
     dev_ctx.template Alloc(Out);
     VLOG(3) << "MatMul with blaslt case 1";
     blaslt::Run(dev_ctx,
@@ -569,7 +569,7 @@ void MatMulFunctionImplWithCublasLt(
       std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin());
       out_dims.back() = y_dims.back();
     }
-    Out->ResizeAndAllocate(phi::make_ddim(out_dims));
+    Out->ResizeAndAllocate(common::make_ddim(out_dims));
     dev_ctx.template Alloc(Out);
     if (trans_y) {
       const int M = Y.numel() / N;
@@ -652,7 +652,7 @@ void MatMulFunctionImplWithCublasLt(
     } else {
       std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin());
     }
-    Out->ResizeAndAllocate(phi::make_ddim(out_dims));
+    Out->ResizeAndAllocate(common::make_ddim(out_dims));
     dev_ctx.template Alloc(Out);
     if (trans_x) {
@@ -745,7 +745,7 @@ void MatMulFunctionImplWithCublasLt(
   out_broadcast_dims[ndim - 2] = M;
   out_broadcast_dims[ndim - 1] = N;
-  Out->ResizeAndAllocate(phi::make_ddim(out_broadcast_dims));
+  Out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims));
   dev_ctx.template Alloc(Out);
   const int batch_dim = ndim - 2;
@@ -1030,7 +1030,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& ctx,
       return false;
     }
-    out->Resize(phi::make_ddim({}));
+    out->Resize(common::make_ddim({}));
     ctx.template Alloc(out);
     blaslt::Run(ctx,
                 y_data,
@@ -1083,7 +1083,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& ctx,
       std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin());
       out_dims.back() = y_dims.back();
     }
-    out->ResizeAndAllocate(phi::make_ddim(out_dims));
+    out->ResizeAndAllocate(common::make_ddim(out_dims));
     ctx.template Alloc(out);
     if (trans_y) {
       const int M = y.numel() / N;
@@ -1170,7 +1170,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& ctx,
     } else {
       std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin());
     }
-    out->ResizeAndAllocate(phi::make_ddim(out_dims));
+    out->ResizeAndAllocate(common::make_ddim(out_dims));
     ctx.template Alloc(out);
     if (trans_x) {
@@ -1259,7 +1259,7 @@ bool inline MatMulInt8Function(const phi::GPUContext& ctx,
   out_broadcast_dims[ndim - 2] = M;
   out_broadcast_dims[ndim - 1] = N;
-  out->ResizeAndAllocate(phi::make_ddim(out_broadcast_dims));
+  out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims));
   ctx.template Alloc(out);
   const int batch_dim = ndim - 2;
@@ -1475,17 +1475,17 @@ void MatmulKernel(const Context& ctx,
                   bool transpose_y,
                   DenseTensor* out) {
   PADDLE_ENFORCE_NE(
-      phi::product(x.dims()),
+      common::product(x.dims()),
       0,
       phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0,"
                                    " but received dims size is 0. "));
   PADDLE_ENFORCE_NE(
-      phi::product(y.dims()),
+      common::product(y.dims()),
       0,
       phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0,"
")); - const std::vector x_dims = vectorize(x.dims()); - const std::vector y_dims = vectorize(y.dims()); + const std::vector x_dims = common::vectorize(x.dims()); + const std::vector y_dims = common::vectorize(y.dims()); MatmulJudgeDtypeKernel( ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); } diff --git a/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h b/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h index b0dd76a17eeb36..23924a93f947b3 100644 --- a/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h +++ b/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h @@ -21,37 +21,37 @@ namespace phi { namespace detail { static DDim GetEigenvalueDim(const DDim& dim, int k) { - auto vec = phi::vectorize(dim); + auto vec = common::vectorize(dim); vec.erase(vec.end() - 2, vec.end()); vec.push_back(k); - return phi::make_ddim(vec); + return common::make_ddim(vec); } static DDim NewAxisDim(const DDim& dim, int k) { - auto vec = phi::vectorize(dim); + auto vec = common::vectorize(dim); vec.push_back(k); - return phi::make_ddim(vec); + return common::make_ddim(vec); } static DDim RemoveLastDim(const DDim& dim) { - auto vec = phi::vectorize(dim); + auto vec = common::vectorize(dim); if (vec.size() <= 1) { - return phi::make_ddim({1}); + return common::make_ddim({1}); } vec.erase(vec.end() - 1, vec.end()); - return phi::make_ddim(vec); + return common::make_ddim(vec); } static DDim GetUDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); + auto x_vec = common::vectorize(x_dim); x_vec[x_vec.size() - 1] = k; - return phi::make_ddim(x_vec); + return common::make_ddim(x_vec); } static DDim GetVHDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); + auto x_vec = common::vectorize(x_dim); x_vec[x_vec.size() - 2] = k; - return phi::make_ddim(x_vec); + return common::make_ddim(x_vec); } } // namespace detail diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index cdf90cba70690e..85f253fd32d492 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h +++ b/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -16,10 +16,10 @@ #include "glog/logging.h" +#include "paddle/common/macros.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" #include "paddle/phi/kernels/merged_momentum_kernel.h" diff --git a/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h b/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h index bdedcee0957074..566f7ac38bdcf0 100644 --- a/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/meshgrid_grad_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/macros.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/meshgrid_grad_kernel.h" diff --git a/paddle/phi/kernels/impl/meshgrid_kernel_impl.h b/paddle/phi/kernels/impl/meshgrid_kernel_impl.h index dfe162a270a9b5..3507086a1964b3 100644 --- a/paddle/phi/kernels/impl/meshgrid_kernel_impl.h +++ b/paddle/phi/kernels/impl/meshgrid_kernel_impl.h @@ -58,9 +58,9 @@ void MeshgridForward(const Context& ctx, DenseTensor reshape_ins_tensor; phi::Copy(ctx, *ins[i], ctx.GetPlace(), false, &reshape_ins_tensor); - DDim 
-      DDim out_dims_reshape = phi::make_ddim(view_shape);
+      DDim out_dims_reshape = common::make_ddim(view_shape);
       reshape_ins_tensor.Resize(out_dims_reshape);
-      DDim out_dims = phi::make_ddim(shape);
+      DDim out_dims = common::make_ddim(shape);
       Eigen::DSizes bcast_dims;
       for (int64_t j = 0; j < size; j++) {
diff --git a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h
index e63ee31190757e..d3d854ef541fc4 100644
--- a/paddle/phi/kernels/impl/multi_dot_kernel_impl.h
+++ b/paddle/phi/kernels/impl/multi_dot_kernel_impl.h
@@ -42,7 +42,7 @@ inline DenseTensor MatMul(const Context& ctx,
   auto blas = phi::funcs::GetBlas(ctx);
   DenseTensor matrix_c;
-  phi::DDim c_dim = phi::make_ddim({a_dim[0], b_dim[1]});
+  phi::DDim c_dim = common::make_ddim({a_dim[0], b_dim[1]});
   matrix_c.Resize(c_dim);
   ctx.template Alloc(&matrix_c);
@@ -175,9 +175,9 @@ inline void GetDims(const std::vector& ins,
   for (size_t i = 0; i < n; i++) {
     (*ins_dims)[i] = ins[i]->dims();
     if (i == 0 && (*ins_dims)[i].size() == 1) {
-      (*ins_dims)[i] = phi::make_ddim({1, (*ins_dims)[i][0]});
+      (*ins_dims)[i] = common::make_ddim({1, (*ins_dims)[i][0]});
     } else if (i == n - 1 && (*ins_dims)[i].size() == 1) {
-      (*ins_dims)[i] = phi::make_ddim({(*ins_dims)[i][0], 1});
+      (*ins_dims)[i] = common::make_ddim({(*ins_dims)[i][0], 1});
     }
   }
 }
@@ -212,7 +212,7 @@ void MultiDotKernel(const Context& ctx,
   auto mat_dim_c = phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
   if (cost1 < cost2) {
     DenseTensor tmp_out;
-    phi::DDim tmp_dim = phi::make_ddim({Ma, Nb});
+    phi::DDim tmp_dim = common::make_ddim({Ma, Nb});
     tmp_out.Resize(tmp_dim);
     ctx.template Alloc(&tmp_out);
     blas.MatMul(
@@ -221,7 +221,7 @@ void MultiDotKernel(const Context& ctx,
     blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0));
   } else {
     DenseTensor tmp_out;
-    phi::DDim tmp_dim = phi::make_ddim({Ka, Nc});
+    phi::DDim tmp_dim = common::make_ddim({Ka, Nc});
     tmp_out.Resize(tmp_dim);
     ctx.template Alloc(&tmp_out);
     blas.MatMul(
@@ -357,14 +357,14 @@ void MultiDotGradKernel(const Context& ctx,
   phi::DDim dout_dim = dout.dims();
   if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) {
-    dout_dim = phi::make_ddim({1, 1});
+    dout_dim = common::make_ddim({1, 1});
   } else if (ins[0]->dims().size() == 1) {
     if (dout_dim.size() == 1) {
-      dout_dim = phi::make_ddim({1, dout_dim[0]});
+      dout_dim = common::make_ddim({1, dout_dim[0]});
     }
   } else if (ins[n - 1]->dims().size() == 1) {
     if (dout_dim.size() == 1) {
-      dout_dim = phi::make_ddim({dout_dim[0], 1});
+      dout_dim = common::make_ddim({dout_dim[0], 1});
     }
   }
diff --git a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
index e3e19370c86bf1..cf00a9b82b8dd8 100644
--- a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h
@@ -14,7 +14,7 @@ limitations under the License. */
 #pragma once
-#include "paddle/phi/core/ddim.h"
+#include "paddle/common/ddim.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
 #include "paddle/phi/kernels/pool_grad_kernel.h"
diff --git a/paddle/phi/kernels/impl/pool_kernel_impl.h b/paddle/phi/kernels/impl/pool_kernel_impl.h
index a2a6705a68302b..dc0b7ad2108ac5 100644
--- a/paddle/phi/kernels/impl/pool_kernel_impl.h
+++ b/paddle/phi/kernels/impl/pool_kernel_impl.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include
-#include "paddle/phi/core/ddim.h"
+#include "paddle/common/ddim.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
 #include "paddle/phi/kernels/pool_kernel.h"
diff --git a/paddle/phi/kernels/impl/pow2_decay_with_linear_warmup_kernel_impl.h b/paddle/phi/kernels/impl/pow2_decay_with_linear_warmup_kernel_impl.h
index da28f52f6173b8..006a8f1e058626 100644
--- a/paddle/phi/kernels/impl/pow2_decay_with_linear_warmup_kernel_impl.h
+++ b/paddle/phi/kernels/impl/pow2_decay_with_linear_warmup_kernel_impl.h
@@ -14,8 +14,8 @@
 #pragma once
+#include "paddle/common/macros.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/macros.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
 namespace phi {
diff --git a/paddle/phi/kernels/impl/qr_grad_kernel_impl.h b/paddle/phi/kernels/impl/qr_grad_kernel_impl.h
index d22eca3c73393e..e015909d6e7b56 100644
--- a/paddle/phi/kernels/impl/qr_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/qr_grad_kernel_impl.h
@@ -38,7 +38,7 @@ static DenseTensor Fill(const Context& ctx,
                         std::vector shape,
                         float fill_value) {
   DenseTensor ret;
-  ret.Resize(make_ddim(shape));
+  ret.Resize(common::make_ddim(shape));
   ctx.template Alloc(&ret);
   funcs::SetConstant()(ctx, &ret, T(fill_value));
   return ret;
@@ -101,7 +101,7 @@ void QrGradKernel(const Context& ctx,
     R_term = Matmul(ctx, R, TransposeLast2Dim(ctx, dR));
   } else {
-    R_term = Fill(ctx, phi::vectorize(R.dims()), 0);
+    R_term = Fill(ctx, common::vectorize(R.dims()), 0);
   }
   // dQ^H * Q
@@ -110,7 +110,7 @@ void QrGradKernel(const Context& ctx,
     Q_term = Matmul(ctx, TransposeLast2Dim(ctx, dQ), Q);
   } else {
-    Q_term = Fill(ctx, phi::vectorize(R.dims()), 0);
+    Q_term = Fill(ctx, common::vectorize(R.dims()), 0);
   }
   DenseTensor M_tmp1 = Subtract(ctx, R_term, Q_term);
@@ -160,8 +160,8 @@ void QrGradKernel(const Context& ctx,
     dQ_prime = Matmul(ctx, Y, TransposeLast2Dim(ctx, dV));
   } else {
-    dV = Fill(ctx, phi::vectorize(Y.dims()), 0);
-    dQ_prime = Fill(ctx, phi::vectorize(Q.dims()), 0);
+    dV = Fill(ctx, common::vectorize(Y.dims()), 0);
+    dQ_prime = Fill(ctx, common::vectorize(Q.dims()), 0);
   }
   if (dQ.initialized()) {
diff --git a/paddle/phi/kernels/impl/quant_linear_kernel_impl.h b/paddle/phi/kernels/impl/quant_linear_kernel_impl.h
index dbd548f7af6da2..f48e871dce1659 100644
--- a/paddle/phi/kernels/impl/quant_linear_kernel_impl.h
+++ b/paddle/phi/kernels/impl/quant_linear_kernel_impl.h
@@ -37,7 +37,7 @@ void QuantLinearKernel(const Context& dev_ctx,
   auto input_dims = x.dims();
   std::vector output_dims;
-  auto in_mat_dims = phi::flatten_to_2d(input_dims, in_num_col_dims);
+  auto in_mat_dims = common::flatten_to_2d(input_dims, in_num_col_dims);
   auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0];
  auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1];
   PADDLE_ENFORCE_EQ(
@@ -51,7 +51,7 @@ void QuantLinearKernel(const Context& dev_ctx,
           in_mat_dims[1],
           in_mat_dims,
           w_dims0,
-          phi::make_ddim({w_dims0, w_dims1})));
+          common::make_ddim({w_dims0, w_dims1})));
   output_dims.reserve(static_cast(in_num_col_dims + 1));
   for (int i = 0; i < in_num_col_dims; ++i) {
@@ -59,11 +59,11 @@ void QuantLinearKernel(const Context& dev_ctx,
   }
   output_dims.push_back(w_dims1);
-  y->Resize(phi::make_ddim(output_dims));
+  y->Resize(common::make_ddim(output_dims));
   y->set_lod(x.lod());
   auto out_dims = y->dims();
-  int M = phi::product(out_dims) / w_dims1;
+  int M = common::product(out_dims) / w_dims1;
   const T* input_data = x.data();
   auto* output_data = dev_ctx.template Alloc(y, y->numel() * sizeof(T));
diff --git a/paddle/phi/kernels/impl/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h
index 5665c9713c4764..2449d4decd965f 100644
--- a/paddle/phi/kernels/impl/reduce_grad.h
+++ b/paddle/phi/kernels/impl/reduce_grad.h
@@ -14,7 +14,7 @@
 #pragma once
-#include "paddle/phi/core/macros.h"
+#include "paddle/common/macros.h"
 #include "paddle/phi/kernels/cast_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/reduce_grad_functions.h"
diff --git a/paddle/phi/kernels/impl/renorm_impl.h b/paddle/phi/kernels/impl/renorm_impl.h
index 554ccb6c1833f9..409c0a5c4e1f31 100644
--- a/paddle/phi/kernels/impl/renorm_impl.h
+++ b/paddle/phi/kernels/impl/renorm_impl.h
@@ -280,8 +280,8 @@ void RenormFunc(const phi::GPUContext& ctx,
   int64_t dim_divisor = 1, pre_mul = 1;
   for (int i = dim + 1; i < dim_size; i++) dim_divisor *= input_dims[i];
   for (int i = 0; i < dim; i++) pre_mul *= input_dims[i];
-  pow_value.Resize(phi::make_ddim({pre_mul, dimension_each, dim_divisor}));
-  dim_value.Resize(phi::make_ddim({dimension_each}));
+  pow_value.Resize(common::make_ddim({pre_mul, dimension_each, dim_divisor}));
+  dim_value.Resize(common::make_ddim({dimension_each}));
   T* pow_value_data = ctx.template Alloc(&pow_value);
   T* dim_value_data = ctx.template Alloc(&dim_value);
   auto stream = ctx.stream();
@@ -317,11 +317,11 @@ void RenormGradFunc(const phi::GPUContext& ctx,
   for (int i = dim + 1; i < dim_size; i++) dim_divisor *= input_dims[i];
   for (int i = 0; i < dim; i++) pre_mul *= input_dims[i];
   DenseTensor pow_value, mul_value, dim_value, dim_power_sum, weight_derivative;
-  pow_value.Resize(phi::make_ddim({pre_mul, dimension_each, dim_divisor}));
-  mul_value.Resize(phi::make_ddim({pre_mul, dimension_each, dim_divisor}));
-  dim_value.Resize(phi::make_ddim({dimension_each}));
-  dim_power_sum.Resize(phi::make_ddim({dimension_each}));
-  weight_derivative.Resize(phi::make_ddim({dimension_each}));
+  pow_value.Resize(common::make_ddim({pre_mul, dimension_each, dim_divisor}));
+  mul_value.Resize(common::make_ddim({pre_mul, dimension_each, dim_divisor}));
+  dim_value.Resize(common::make_ddim({dimension_each}));
+  dim_power_sum.Resize(common::make_ddim({dimension_each}));
+  weight_derivative.Resize(common::make_ddim({dimension_each}));
   auto stream = ctx.stream();
   int block = std::min(numel, static_cast(256));
   int grid = (numel + block - 1) / block;
diff --git a/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h b/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h
index 806e2be66332cb..d8c56000639bbc 100644
--- a/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h
@@ -107,7 +107,7 @@ void RepeatInterleaveWithTensorIndexGradKernel(
 #if defined(__NVCC__) || defined(__HIPCC__)
   auto output_dim = out_grad.dims();
-  auto stride_dim = phi::stride(input_dim);
+  auto stride_dim = common::stride(input_dim);
   int64_t stride = stride_dim[dim];
   int64_t size = output_dim[dim];
   int64_t delta = input_dim[dim] - size;
@@ -181,7 +181,7 @@ void RepeatInterleaveGradKernel(const Context& ctx,
   DenseTensor index;
 #if defined(__NVCC__) || defined(__HIPCC__)
   auto output_dim = out_grad.dims();
-  auto stride_dim = phi::stride(input_dim);
+  auto stride_dim = common::stride(input_dim);
   int64_t stride = stride_dim[dim];
   int64_t size = output_dim[dim];
   int64_t delta = input_dim[dim] - size;
@@ -201,7 +201,7 @@ void RepeatInterleaveGradKernel(const Context& ctx,
   for (int i = 0; i < x_grad->dims()[dim]; i++) {
     std::fill_n(index_vec.begin() + i * repeats, repeats, i);
   }
-  index.Resize(phi::make_ddim({index_size}));
+  index.Resize(common::make_ddim({index_size}));
   phi::TensorFromVector(index_vec, ctx, &index);
   const int* index_data = index.data();
diff --git a/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h b/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h
index 9ac7ac6072db44..05f1bba3c0ea68 100644
--- a/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h
+++ b/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h
@@ -77,24 +77,24 @@ void RepeatInterleaveKernel(const Context& ctx,
   for (int i = 0; i < input_dim[dim]; i++) {
     std::fill_n(index_vec.begin() + i * repeats, repeats, i);
   }
-  index.Resize(phi::make_ddim({index_size}));
+  index.Resize(common::make_ddim({index_size}));
   if (place == cpu_place) {
     DenseTensor x_copy = x;
     phi::TensorFromVector(index_vec, ctx, &index);
-    auto output_dim = phi::vectorize(x.dims());
+    auto output_dim = common::vectorize(x.dims());
     output_dim[dim] = index_size;
-    out->Resize(phi::make_ddim(output_dim));
+    out->Resize(common::make_ddim(output_dim));
     phi::IndexSelectInner(ctx, &x_copy, index, out, dim);
 #if defined(__NVCC__) || defined(__HIPCC__)
   } else {
-    auto stride_dim = phi::stride(input_dim);
+    auto stride_dim = common::stride(input_dim);
     int64_t stride = stride_dim[dim];
     phi::TensorFromVector(index_vec, ctx, &index);
     auto stream = ctx.stream();
-    auto output_dim = phi::vectorize(x.dims());
+    auto output_dim = common::vectorize(x.dims());
     output_dim[dim] = index_size;
-    out->Resize(phi::make_ddim(output_dim));
+    out->Resize(common::make_ddim(output_dim));
     ctx.template Alloc(out);
     auto* out_data = out->data();
     int64_t numel = out->numel();
@@ -153,21 +153,21 @@ void RepeatInterleaveWithTensorIndexKernel(const Context& ctx,
     if (index_type == phi::DataType::INT32) {
       phi::funcs::RepeatsTensor2IndexTensor(
           ctx, repeats_tensor, &index);
-      auto output_dim = phi::vectorize(x.dims());
+      auto output_dim = common::vectorize(x.dims());
       output_dim[dim] = index.dims()[0];
-      out->Resize(phi::make_ddim(output_dim));
+      out->Resize(common::make_ddim(output_dim));
       IndexSelectInner(ctx, &x_copy, index, out, dim);
     } else if (index_type == phi::DataType::INT64) {
       phi::funcs::RepeatsTensor2IndexTensor(
           ctx, repeats_tensor, &index);
-      auto output_dim = phi::vectorize(x.dims());
+      auto output_dim = common::vectorize(x.dims());
       output_dim[dim] = index.dims()[0];
-      out->Resize(phi::make_ddim(output_dim));
+      out->Resize(common::make_ddim(output_dim));
       IndexSelectInner(ctx, &x_copy, index, out, dim);
     }
 #if defined(__NVCC__) || defined(__HIPCC__)
   } else {
-    auto stride_dim = phi::stride(input_dim);
+    auto stride_dim = common::stride(input_dim);
     int64_t stride = stride_dim[dim];
     auto stream = ctx.stream();
     auto* in_data = x.data();
@@ -176,9 +176,9 @@ void RepeatInterleaveWithTensorIndexKernel(const Context& ctx,
           ctx, repeats_tensor, &index);
       const int64_t* index_data = index.data();
-      auto output_dim = phi::vectorize(x.dims());
+      auto output_dim = common::vectorize(x.dims());
       output_dim[dim] = index.dims()[0];
-      out->Resize(phi::make_ddim(output_dim));
+      out->Resize(common::make_ddim(output_dim));
       T* out_data = ctx.template Alloc(out);
       int64_t numel = out->numel();
       int64_t size = output_dim[dim];
@@ -195,9 +195,9 @@ void RepeatInterleaveWithTensorIndexKernel(const Context& ctx,
           ctx, repeats_tensor, &index);
       const int* index_data = index.data();
-      auto output_dim = phi::vectorize(x.dims());
+      auto output_dim = common::vectorize(x.dims());
       output_dim[dim] = index.dims()[0];
-      out->Resize(phi::make_ddim(output_dim));
+      out->Resize(common::make_ddim(output_dim));
       T* out_data = ctx.template Alloc(out);
       int64_t numel = out->numel();
       int64_t size = output_dim[dim];
diff --git a/paddle/phi/kernels/impl/searchsorted_kernel_impl.h b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h
index b3be4b9d556645..f933b718a28fe8 100644
--- a/paddle/phi/kernels/impl/searchsorted_kernel_impl.h
+++ b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h
@@ -16,7 +16,7 @@
 #include
-#include "paddle/phi/core/ddim.h"
+#include "paddle/common/ddim.h"
 #include "paddle/phi/kernels/funcs/algorithm.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
diff --git a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h
index 216d5e6100d6cf..3b6f9998a00129 100644
--- a/paddle/phi/kernels/impl/segment_pool_kernel_impl.h
+++ b/paddle/phi/kernels/impl/segment_pool_kernel_impl.h
@@ -67,7 +67,7 @@ void SegmentKernelLaunchHelper(const Context& dev_ctx,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!cpu_place) {
     DenseTensor length;
-    length.Resize(phi::make_ddim({1}));
+    length.Resize(common::make_ddim({1}));
     IndexT* length_data = dev_ctx.template HostAlloc(&length);
     const IndexT* segment_ids_ptr = segment_ids.data();
diff --git a/paddle/phi/kernels/impl/sequence_mask_kernel_impl.h b/paddle/phi/kernels/impl/sequence_mask_kernel_impl.h
index 80834fae85411e..f2eb1f8a39f970 100644
--- a/paddle/phi/kernels/impl/sequence_mask_kernel_impl.h
+++ b/paddle/phi/kernels/impl/sequence_mask_kernel_impl.h
@@ -44,9 +44,9 @@ void SequenceMaskKernel(const Context& ctx,
     maxlen = *max_len_tensor.get_ptr()->data();
   }
-  auto y_dim = phi::vectorize(x.dims());
+  auto y_dim = common::vectorize(x.dims());
   y_dim.push_back(maxlen);
-  y->Resize(phi::make_ddim(y_dim));
+  y->Resize(common::make_ddim(y_dim));
   PADDLE_ENFORCE_GT(
       maxlen,
@@ -76,9 +76,9 @@ void SequenceMaskKernel(const Context& ctx,
       maxlen = static_cast(*std::max_element(x_data, x_data + x_numel));
 #endif
   }
-  auto y_dim = phi::vectorize(x.dims());
+  auto y_dim = common::vectorize(x.dims());
   y_dim.push_back(maxlen);
-  y->Resize(phi::make_ddim(y_dim));
+  y->Resize(common::make_ddim(y_dim));
   phi::VisitDataType(phi::TransToPhiDataType(out_dtype),
diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h
index 3d2a0a3d0db67c..3f78361b92b8bd 100644
--- a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h
@@ -84,7 +84,7 @@ void SetValueGradImpl(const Context& dev_ctx,
                                            axes.size(),
                                            false);
-  DDim out_dims(phi::make_ddim(out_dims_vector));
+  DDim out_dims(common::make_ddim(out_dims_vector));
   std::vector reverse_vector(starts_local.size(), 0);
   funcs::StridedSliceFunctor(starts_local.data(),
diff --git a/paddle/phi/kernels/impl/set_value_kernel_impl.h b/paddle/phi/kernels/impl/set_value_kernel_impl.h
index 2c545ac06ada11..f9e582d440f7b4 100644
--- a/paddle/phi/kernels/impl/set_value_kernel_impl.h
+++ b/paddle/phi/kernels/impl/set_value_kernel_impl.h
@@ -113,7 +113,7 @@ void SetValueImpl(const Context& dev_ctx,
       none_axes_cur++;
     }
-    slice_dims_for_assign = phi::make_ddim(slice_dims_with_none);
+    slice_dims_for_assign = common::make_ddim(slice_dims_with_none);
   }
   auto place = dev_ctx.GetPlace();
@@ -336,7 +336,7 @@ void SetValueKernel(const Context& dev_ctx,
   }
   DenseTensor value_tensor = Empty(dev_ctx, shape);
   phi::TensorFromVector(assgin_values, dev_ctx, &value_tensor);
-  value_tensor.Resize(phi::make_ddim(shape));
+  value_tensor.Resize(common::make_ddim(shape));
   SetTensorValueKernel(dev_ctx,
                        x,
diff --git a/paddle/phi/kernels/impl/slice_grad_kernel_impl.h b/paddle/phi/kernels/impl/slice_grad_kernel_impl.h
index ac2769e041e398..fa3ef0318fbb17 100644
--- a/paddle/phi/kernels/impl/slice_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/slice_grad_kernel_impl.h
@@ -14,7 +14,7 @@
 #pragma once
-#include "paddle/phi/core/macros.h"
+#include "paddle/common/macros.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
@@ -109,8 +109,8 @@ void EigenPaddingCompute(
     out_tore_shape[1] = out_dims[pad_dim];
     // convert array from std::vector to DDim
-    DDim reshaped_in_dims = make_ddim(in_tore_shape);
-    DDim reshaped_out_dims = make_ddim(out_tore_shape);
+    DDim reshaped_in_dims = common::make_ddim(in_tore_shape);
+    DDim reshaped_out_dims = common::make_ddim(out_tore_shape);
     // after reshape: the first dimension do not need padding,
     // set padding[0] zero
@@ -142,8 +142,8 @@ void EigenPaddingCompute(
     }
     // convert array from std::vector to DDim
-    DDim reshaped_in_dims = make_ddim(in_tore_shape);
-    DDim reshaped_out_dims = make_ddim(out_tore_shape);
+    DDim reshaped_in_dims = common::make_ddim(in_tore_shape);
+    DDim reshaped_out_dims = common::make_ddim(out_tore_shape);
     // after reshape:
     // the first dimension is the previous padding dimension
@@ -180,8 +180,8 @@ void EigenPaddingCompute(
     }
     // convert array from std::vector to DDim
-    DDim reshaped_in_dims = make_ddim(in_tore_shape);
-    DDim reshaped_out_dims = make_ddim(out_tore_shape);
+    DDim reshaped_in_dims = common::make_ddim(in_tore_shape);
+    DDim reshaped_out_dims = common::make_ddim(out_tore_shape);
     // after reshape:
     // the first dimension do not need padding, set padding[0] zero
@@ -228,7 +228,7 @@ void SliceGradCompute(const Context& ctx,
   if (decrease_size == static_cast(in_dims.size())) {
     // all dims decrease
     std::vector origin_out_shape(decrease_size, 1);
-    out_dims = make_ddim(std::vector(decrease_size, 1));
+    out_dims = common::make_ddim(std::vector(decrease_size, 1));
   } else {
     std::vector origin_out_shape(out_dims.size() + decrease_size, -1);
     for (size_t i = 0; i < decrease_size; ++i) {
@@ -243,7 +243,7 @@ void SliceGradCompute(const Context& ctx,
       }
     }
-    out_dims = make_ddim(origin_out_shape);
+    out_dims = common::make_ddim(origin_out_shape);
   }
 }
diff --git a/paddle/phi/kernels/impl/slogdeterminant_grad_kernel_impl.h b/paddle/phi/kernels/impl/slogdeterminant_grad_kernel_impl.h
index e7fa5edf9ad4ab..c964f91c690037 100644
--- a/paddle/phi/kernels/impl/slogdeterminant_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/slogdeterminant_grad_kernel_impl.h
@@ -60,7 +60,7 @@ void SlogDeterminantGradKernel(const Context& dev_ctx,
     VLOG(3) << "The input matrix not invertible!";
     x_grad->Resize(x.dims());
     phi::Full(dev_ctx,
-              phi::vectorize(x.dims()),
+              common::vectorize(x.dims()),
              std::numeric_limits::quiet_NaN(),
              x_grad);
     return;
diff --git a/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h b/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h
index a5798d66ee5c7e..05bd6097554ca2 100644
--- a/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h
+++ b/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h
@@ -75,7 +75,7 @@ template
 void SlogDeterminantKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            DenseTensor* out) {
-  auto input_dim = vectorize(x.dims());
+  auto input_dim = common::vectorize(x.dims());
   auto input_dim_size = input_dim.size();
   auto batch_count = detail::GetBatchCount(x.dims());
@@ -98,7 +98,7 @@ void SlogDeterminantKernel(const Context& dev_ctx,
   }
   output_dim_vec.insert(output_dim_vec.begin(), 2);
   // make the output dims as same as numpy
-  auto output_dims = phi::make_ddim(output_dim_vec);
+  auto output_dims = common::make_ddim(output_dim_vec);
   out->Resize(output_dims);
   VLOG(2) << "output dim:" << out->dims();
 }
diff --git a/paddle/phi/kernels/impl/solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
index 7386e8beb22cbb..fa25f2a0887972 100644
--- a/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
@@ -100,17 +100,17 @@ void SolveGradKernel(const Context& dev_ctx,
       get_broadcast_dims(tmp_x, tmp_y);
   // tmp_dx
   DenseTensor tmp_dx;
-  tmp_dx.Resize(phi::make_ddim(x_broadcast_dims));
+  tmp_dx.Resize(common::make_ddim(x_broadcast_dims));
   dev_ctx.template Alloc(&tmp_dx);
   // tmp_dy
   DenseTensor tmp_dy;
-  tmp_dy.Resize(phi::make_ddim(y_broadcast_dims));
+  tmp_dy.Resize(common::make_ddim(y_broadcast_dims));
   dev_ctx.template Alloc(&tmp_dy);
   DenseTensor tmp_input(x.dtype());
   const auto& new_dims_vec = phi::funcs::getNewDimsVec(x.dims());
-  tmp_input.Resize(phi::make_ddim(new_dims_vec));
+  tmp_input.Resize(common::make_ddim(new_dims_vec));
   dev_ctx.template Alloc(&tmp_input);
   phi::funcs::TransposeNormal trans;
@@ -174,9 +174,9 @@ void SolveGradKernel(const Context& dev_ctx,
     phi::Copy(dev_ctx, tmp_dy, dev_ctx.GetPlace(), false, &dy_help);
     // get dims
-    std::vector x_dims = vectorize(x.dims());
-    std::vector y_dims = vectorize(y.dims());
-    std::vector dout_dims = vectorize(dout.dims());
+    std::vector x_dims = common::vectorize(x.dims());
+    std::vector y_dims = common::vectorize(y.dims());
+    std::vector dout_dims = common::vectorize(dout.dims());
     if (is_vector_rhs(x, y)) {
       dout_dims.push_back(1);
@@ -185,7 +185,8 @@ void SolveGradKernel(const Context& dev_ctx,
     int y_ndim = y_dims.size();
     int ndim = dout_dims.size();
-    const std::vector dy_help_dims = vectorize(dy_help.dims());
+    const std::vector dy_help_dims =
+        common::vectorize(dy_help.dims());
     std::vector dy_broadcast_dims(ndim);
     std::fill(
@@ -224,13 +225,14 @@ void SolveGradKernel(const Context& dev_ctx,
     dev_ctx.Alloc(&dx_help, tmp_dx.dtype());
     phi::Copy(dev_ctx, tmp_dx, dev_ctx.GetPlace(), false, &dx_help);
     // get dims
-    std::vector x_dims = vectorize(x.dims());
-    std::vector y_dims = vectorize(y.dims());
+    std::vector x_dims = common::vectorize(x.dims());
+    std::vector y_dims = common::vectorize(y.dims());
     int x_ndim = x_dims.size();
     int ndim = x_broadcast_dims.size();
-    const std::vector dx_help_dims = vectorize(dx_help.dims());
+    const std::vector dx_help_dims =
+        common::vectorize(dx_help.dims());
     std::vector dx_broadcast_dims(ndim);
     std::fill(
         dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1);
diff --git a/paddle/phi/kernels/impl/solve_kernel_impl.h b/paddle/phi/kernels/impl/solve_kernel_impl.h
index d5ecfdff21a998..ddfc18db7fc312 100644
--- a/paddle/phi/kernels/impl/solve_kernel_impl.h
+++ b/paddle/phi/kernels/impl/solve_kernel_impl.h
@@ -32,8 +32,8 @@ static inline bool is_vector_rhs(const DenseTensor& input,
   auto y_dim = other.dims();
   auto x_dim_size = x_dim.size();
   auto y_dim_size = y_dim.size();
-  std::vector x_dims_vec = phi::vectorize(x_dim);
-  std::vector y_dims_vec = phi::vectorize(y_dim);
+  std::vector x_dims_vec = common::vectorize(x_dim);
+  std::vector y_dims_vec = common::vectorize(y_dim);
   std::vector::const_iterator f = x_dims_vec.begin();
   std::vector::const_iterator l = x_dims_vec.end() - 1;
@@ -88,8 +88,8 @@ static inline std::vector convert_to_int_vec(std::vector a) {
 // broadcast the batch dimensions of tensor x and tensor y.
 static inline std::tuple, std::vector>
 get_broadcast_dims(const Tensor& x, const Tensor& y) {
-  std::vector x_dims_vec = phi::vectorize(x.dims());
-  std::vector y_dims_vec = phi::vectorize(y.dims());
+  std::vector x_dims_vec = common::vectorize(x.dims());
+  std::vector y_dims_vec = common::vectorize(y.dims());
   std::vector::const_iterator f1 = x_dims_vec.begin();
   std::vector::const_iterator l1 = x_dims_vec.end() - 2;
   std::vector x_dims_vec_cut(f1, l1);
diff --git a/paddle/phi/kernels/impl/spectral_norm_grad_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_grad_kernel_impl.h
index 5bdb874bc89c47..dd9489da089c7b 100644
--- a/paddle/phi/kernels/impl/spectral_norm_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/spectral_norm_grad_kernel_impl.h
@@ -48,9 +48,9 @@ void SpectralNormGradKernel(const Context& dev_ctx,
       real_dims.push_back(dims[i]);
     }
   }
-  weight_mat.Resize(phi::make_ddim(real_dims));
+  weight_mat.Resize(common::make_ddim(real_dims));
   dev_ctx.template Alloc(&weight_mat);
-  out_grad_mat.Resize(phi::make_ddim(real_dims));
+  out_grad_mat.Resize(common::make_ddim(real_dims));
   dev_ctx.template Alloc(&out_grad_mat);
   TransCompute2DTo5D(dev_ctx, weight, rank, perm, &weight_mat);
   TransCompute2DTo5D(
@@ -114,7 +114,7 @@ void SpectralNormGradKernel(const Context& dev_ctx,
   dev_ctx.template Alloc(weight_grad);
   TransCompute2DTo5D(
       dev_ctx,
-      weight_grad_mat.Resize(phi::make_ddim(real_dims)),
+      weight_grad_mat.Resize(common::make_ddim(real_dims)),
      rank,
      perm,
      weight_grad);
diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h
index 57c5c69a63d614..86312b06c76950 100644
--- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h
+++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h
@@ -129,7 +129,7 @@ void SpectralNormKernel(const Context& dev_ctx,
       real_dims.push_back(dims[i]);
     }
   }
-  weight_mat.Resize(phi::make_ddim(real_dims));
+  weight_mat.Resize(common::make_ddim(real_dims));
   dev_ctx.template Alloc(&weight_mat);
   TransCompute2DTo5D(dev_ctx, weight, rank, perm, &weight_mat);
   } else {
@@ -168,7 +168,11 @@ void SpectralNormKernel(const Context& dev_ctx,
     out->Resize(dims);
     dev_ctx.template Alloc(out);
     TransCompute2DTo5D(
-        dev_ctx, weight_mat.Resize(phi::make_ddim(real_dims)), rank, perm, out);
+        dev_ctx,
+        weight_mat.Resize(common::make_ddim(real_dims)),
+        rank,
+        perm,
+        out);
   } else {
     phi::Copy(dev_ctx, weight_mat.Resize(dims), dev_ctx.GetPlace(), true, out);
   }
diff --git a/paddle/phi/kernels/impl/svd_grad_kernel_impl.h b/paddle/phi/kernels/impl/svd_grad_kernel_impl.h
13c86aa576104e..57556ff1990fb7 100644 --- a/paddle/phi/kernels/impl/svd_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/svd_grad_kernel_impl.h @@ -30,7 +30,7 @@ static DenseTensor Fill(const Context& ctx, std::vector shape, float fill_value) { DenseTensor ret; - ret.Resize(make_ddim(shape)); + ret.Resize(common::make_ddim(shape)); ctx.template Alloc(&ret); funcs::SetConstant()(ctx, &ret, T(fill_value)); return ret; @@ -53,7 +53,7 @@ static DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) { // don't copy data, only change the dims DenseTensor out; out.ShareDataWith(x); - std::vector out_shape = phi::vectorize(x.dims()); + std::vector out_shape = common::vectorize(x.dims()); if (axis >= 0) { auto index = (out_shape.begin() + axis); out_shape.insert(index, 1); @@ -61,7 +61,7 @@ static DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) { auto index = (out_shape.end() + axis + 1); out_shape.insert(index, 1); } - out.Resize(phi::make_ddim(out_shape)); + out.Resize(common::make_ddim(out_shape)); return out; } diff --git a/paddle/phi/kernels/impl/tile_grad_kernel_impl.h b/paddle/phi/kernels/impl/tile_grad_kernel_impl.h index d9b97956ce9d1e..a5a95b7bacd987 100644 --- a/paddle/phi/kernels/impl/tile_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/tile_grad_kernel_impl.h @@ -56,7 +56,7 @@ void TileGradKernel(const Context& dev_ctx, const IntArray& repeat_times, DenseTensor* x_grad) { auto x_dims = x.dims(); - auto vec_x_dims = phi::vectorize(x_dims); + auto vec_x_dims = common::vectorize(x_dims); auto repeat_times_data = repeat_times.GetData(); if (repeat_times_data.size() < vec_x_dims.size()) { int diff = vec_x_dims.size() - repeat_times_data.size(); diff --git a/paddle/phi/kernels/impl/tile_kernel_impl.h b/paddle/phi/kernels/impl/tile_kernel_impl.h index f7b923b00b1ca1..4e19d9183f4048 100644 --- a/paddle/phi/kernels/impl/tile_kernel_impl.h +++ b/paddle/phi/kernels/impl/tile_kernel_impl.h @@ -37,7 +37,7 @@ void Tile(const Context& dev_ctx, "be positive integers, but the value received is %d.", repeat_times[i])); } - auto vec_x_dims = phi::vectorize(x_dims); + auto vec_x_dims = common::vectorize(x_dims); if (repeat_times.size() < vec_x_dims.size()) { int diff = vec_x_dims.size() - repeat_times.size(); repeat_times.insert(repeat_times.begin(), diff, 1); @@ -63,7 +63,7 @@ void Tile(const Context& dev_ctx, bcast_dims[i] = repeat_times[i]; } - DDim new_x_dims = make_ddim(vec_x_dims); + DDim new_x_dims = common::make_ddim(vec_x_dims); DDim out_dims(new_x_dims); for (size_t i = 0; i < repeat_times.size(); ++i) { out_dims[i] *= repeat_times[i]; diff --git a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h index 640fd07a92a2be..964d5871bf9319 100644 --- a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h @@ -89,10 +89,10 @@ void TraceGradKernel(const Context& ctx, int axis2, DenseTensor* in_grad) { auto input_dims = in_grad->dims(); - auto input_stride = phi::stride(input_dims); + auto input_stride = common::stride(input_dims); auto output_dims = out_grad.dims(); auto output_stride = output_dims.size() == 0 ? 
phi::DDim(output_dims) - : phi::stride(output_dims); + : common::stride(output_dims); auto* out_data = out_grad.data(); T* x_data = ctx.template Alloc(in_grad); @@ -121,9 +121,9 @@ void TraceGradKernel(const Context& ctx, int64_t pos = std::abs(offset) * offset_stride; if (diag_size > 0) { #if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector output_vec(vectorize(output_stride)); + thrust::device_vector output_vec(common::vectorize(output_stride)); const int64_t* output_arr = thrust::raw_pointer_cast(output_vec.data()); - thrust::device_vector input_vec(vectorize(input_stride)); + thrust::device_vector input_vec(common::vectorize(input_stride)); const int64_t* input_arr = thrust::raw_pointer_cast(input_vec.data()); #else diff --git a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h index 28f034209188cd..a0e7c3c2ef7cd9 100644 --- a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h @@ -52,8 +52,8 @@ void UnfoldGradKernel(const Context& ctx, paddings[3], strides[1]); - DDim x_shape = make_ddim({x_dims[1], x_dims[2], x_dims[3]}); - DDim out_matrix_shape = make_ddim( + DDim x_shape = common::make_ddim({x_dims[1], x_dims[2], x_dims[3]}); + DDim out_matrix_shape = common::make_ddim( {x_dims[1], kernel_sizes[0], kernel_sizes[1], out_height, out_width}); phi::funcs::Col2ImFunctor col2im; diff --git a/paddle/phi/kernels/impl/unfold_kernel_impl.h b/paddle/phi/kernels/impl/unfold_kernel_impl.h index 7b7e9923d0004d..b1791af358cacd 100644 --- a/paddle/phi/kernels/impl/unfold_kernel_impl.h +++ b/paddle/phi/kernels/impl/unfold_kernel_impl.h @@ -50,8 +50,8 @@ void UnfoldKernel(const Context& ctx, paddings[3], strides[1]); - DDim x_shape = make_ddim({x_dims[1], x_dims[2], x_dims[3]}); - DDim out_matrix_shape = make_ddim( + DDim x_shape = common::make_ddim({x_dims[1], x_dims[2], x_dims[3]}); + DDim out_matrix_shape = common::make_ddim( {x_dims[1], kernel_sizes[0], kernel_sizes[1], out_height, out_width}); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/phi/kernels/impl/warpctc_kernel_impl.h b/paddle/phi/kernels/impl/warpctc_kernel_impl.h index 4b4bd6f5143dd3..275f32f0333cd5 100644 --- a/paddle/phi/kernels/impl/warpctc_kernel_impl.h +++ b/paddle/phi/kernels/impl/warpctc_kernel_impl.h @@ -336,7 +336,7 @@ void WarpctcKernel(const Context& dev_ctx, max_sequence_length = phi::funcs::MaximumSequenceLength(logits_lod); } - auto loss_dims = phi::make_ddim({static_cast(num_sequences), 1}); + auto loss_dims = common::make_ddim({static_cast(num_sequences), 1}); // warpctc needs sequences data stored in transposed padding format DenseTensor warpctc_logits_tmp = diff --git a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h index f51041285aaee9..80ccf6e21b5377 100644 --- a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h +++ b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h @@ -313,7 +313,7 @@ void WarprnntKernel(const Context& dev_ctx, dev_ctx, warprnntgrad, static_cast(0)); // loss on cpu (B,) - auto loss_dims = phi::make_ddim({static_cast(B)}); + auto loss_dims = common::make_ddim({static_cast(B)}); DenseTensor warprnnt_loss; warprnnt_loss.Resize(loss_dims); T* warprnnt_loss_data = dev_ctx.template HostAlloc(&warprnnt_loss); diff --git a/paddle/phi/kernels/is_empty_kernel.cc b/paddle/phi/kernels/is_empty_kernel.cc index 4b86f2dfe69504..dadaa2132e95ed 100644 --- a/paddle/phi/kernels/is_empty_kernel.cc +++ b/paddle/phi/kernels/is_empty_kernel.cc 
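Every hunk in this stretch applies the same mechanical substitution: the dim helpers make_ddim, vectorize, stride, product, and slice_ddim move from namespace phi to namespace common with signatures unchanged, and calls that were unqualified inside namespace phi are requalified explicitly. A minimal sketch of the round trip these call sites rely on, under that assumption only (the function name and the {2, 3, 4} shape below are illustrative, not from the patch):

#include <cstdint>
#include <vector>

#include "paddle/common/ddim.h"

// Exercises the relocated helpers exactly as the call sites in this patch
// use them; a pure namespace change, phi:: -> common::.
void ddim_migration_sketch() {
  common::DDim dims = common::make_ddim({2, 3, 4});    // was phi::make_ddim
  std::vector<int64_t> vec = common::vectorize(dims);  // {2, 3, 4}, was phi::vectorize
  int64_t numel = common::product(dims);               // 2 * 3 * 4 = 24, was phi::product
  common::DDim strides = common::stride(dims);         // row-major strides {12, 4, 1}, was phi::stride
  common::DDim tail = common::slice_ddim(dims, 1, 3);  // dims in [1, 3) -> {3, 4}, was phi::slice_ddim
  (void)vec; (void)numel; (void)strides; (void)tail;   // silence unused-variable warnings
}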
@@ -27,7 +27,7 @@ void IsEmptyKernel(const Context& dev_ctx, // always be allocated for CPUPlace. We register CUDA kernel for this op to // avoid the unnecessary data transform. bool* out_data = dev_ctx.template HostAlloc(out); - out_data[0] = phi::product(x.dims()) == 0; + out_data[0] = common::product(x.dims()) == 0; } } // namespace phi diff --git a/paddle/phi/kernels/kps/reduce_kernel.cu b/paddle/phi/kernels/kps/reduce_kernel.cu index 506bd36e828bc5..74020a8f0975b4 100644 --- a/paddle/phi/kernels/kps/reduce_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_kernel.cu @@ -173,7 +173,7 @@ void ReduceSumEigen(const KPDevice& dev_ctx, (*reduce_dims)[i] += added_dims; } auto eigen_reduce_dim = - EigenDim::From(phi::make_ddim(*reduce_dims)); + EigenDim::From(common::make_ddim(*reduce_dims)); // Calculate eigen_out_tensor.device(*dev_ctx.eigen_device()) = eigen_x_tensor.sum(eigen_reduce_dim); diff --git a/paddle/phi/kernels/legacy/cpu/randint_kernel.cc b/paddle/phi/kernels/legacy/cpu/randint_kernel.cc index 6b988f6294aac8..cf4a0e9a6d2770 100644 --- a/paddle/phi/kernels/legacy/cpu/randint_kernel.cc +++ b/paddle/phi/kernels/legacy/cpu/randint_kernel.cc @@ -29,7 +29,7 @@ void RandintWithSeedKernel(const Context& dev_ctx, DataType dtype UNUSED, int seed, DenseTensor* out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); T* data = dev_ctx.template Alloc(out); auto numel = out->numel(); std::shared_ptr engine; diff --git a/paddle/phi/kernels/legacy/cpu/uniform_kernel.cc b/paddle/phi/kernels/legacy/cpu/uniform_kernel.cc index 3aa697b2409ee9..897b57a8b27b4e 100644 --- a/paddle/phi/kernels/legacy/cpu/uniform_kernel.cc +++ b/paddle/phi/kernels/legacy/cpu/uniform_kernel.cc @@ -28,7 +28,7 @@ void UniformRawKernel(const Context &dev_ctx, int diag_step, float diag_val, DenseTensor *out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); T *data = dev_ctx.template Alloc(out); auto size = out->numel(); std::shared_ptr engine; diff --git a/paddle/phi/kernels/legacy/gpu/randint_kernel.cu b/paddle/phi/kernels/legacy/gpu/randint_kernel.cu index b4aa5e9d8c47ac..5aa0bf07d7ccb5 100644 --- a/paddle/phi/kernels/legacy/gpu/randint_kernel.cu +++ b/paddle/phi/kernels/legacy/gpu/randint_kernel.cu @@ -31,7 +31,7 @@ void RandintWithSeedKernel(const Context& dev_ctx, DataType dtype, int seed, DenseTensor* out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); T* data = dev_ctx.template Alloc(out); funcs::uniform_distribution dist; funcs::uniform_int_transform trans(low, high); diff --git a/paddle/phi/kernels/legacy/gpu/uniform_kernel.cu b/paddle/phi/kernels/legacy/gpu/uniform_kernel.cu index abf51cf61f2b5c..c576608cc0d9f2 100644 --- a/paddle/phi/kernels/legacy/gpu/uniform_kernel.cu +++ b/paddle/phi/kernels/legacy/gpu/uniform_kernel.cu @@ -64,7 +64,7 @@ void UniformRawKernel(const Context& dev_ctx, int diag_step, float diag_val, DenseTensor* out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); dev_ctx.template Alloc(out); if (seed == 0) { // Use global Generator seed diff --git a/paddle/phi/kernels/legacy/xpu/compare_kernel.cc b/paddle/phi/kernels/legacy/xpu/compare_kernel.cc index 5dd06c3fb88105..8957f09be78182 100644 --- a/paddle/phi/kernels/legacy/xpu/compare_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/compare_kernel.cc @@ -33,8 +33,8 @@ void XPUCompareRawKernelImpl(const Context& dev_ctx, bool*, const std::vector&, const
std::vector&)> func) { - auto x_shape = vectorize(x.dims()); - auto y_shape = vectorize(y.dims()); + auto x_shape = common::vectorize(x.dims()); + auto y_shape = common::vectorize(y.dims()); if (x.dims().size() == 0) { x_shape = std::vector({1}); diff --git a/paddle/phi/kernels/legacy/xpu/randint_kernel.cc b/paddle/phi/kernels/legacy/xpu/randint_kernel.cc index 0349ad964c41a9..5f2f91cf0ac07d 100644 --- a/paddle/phi/kernels/legacy/xpu/randint_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/randint_kernel.cc @@ -32,7 +32,7 @@ void RandintWithSeedKernel(const Context& dev_ctx, int seed, DenseTensor* out) { int64_t size = out->numel(); - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); T* data = dev_ctx.template Alloc(out); auto numel = out->numel(); std::shared_ptr engine; diff --git a/paddle/phi/kernels/legacy/xpu/uniform_kernel.cc b/paddle/phi/kernels/legacy/xpu/uniform_kernel.cc index f1907b13e5f967..9e4296dcb4efa1 100644 --- a/paddle/phi/kernels/legacy/xpu/uniform_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/uniform_kernel.cc @@ -34,7 +34,7 @@ void UniformRawKernel(const Context &dev_ctx, int diag_step, float diag_val, DenseTensor *out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); T *data = dev_ctx.template Alloc(out); int64_t size = out->numel(); diff --git a/paddle/phi/kernels/onednn/add_n_kernel.cc b/paddle/phi/kernels/onednn/add_n_kernel.cc index fcd48bdccc3636..f852254043e877 100644 --- a/paddle/phi/kernels/onednn/add_n_kernel.cc +++ b/paddle/phi/kernels/onednn/add_n_kernel.cc @@ -28,7 +28,7 @@ class SumOneDNNHandler : public OneDNNHandlerNoCachingT { : OneDNNHandlerNoCachingT(engine, cpu_place), num_inputs_(0) { - auto dst_tz = vectorize(out->dims()); + auto dst_tz = common::vectorize(out->dims()); auto src_tz = dst_tz; std::vector srcs_md; diff --git a/paddle/phi/kernels/onednn/batch_norm_grad_kernel.cc b/paddle/phi/kernels/onednn/batch_norm_grad_kernel.cc index e648686f3d2e7c..55f8dab5e8673f 100644 --- a/paddle/phi/kernels/onednn/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/batch_norm_grad_kernel.cc @@ -71,7 +71,7 @@ void BatchNormGradFunctor(const Context& dev_ctx, std::vector scale_tz; std::vector bias_tz; if (use_scale) { - scale_tz = vectorize(Scale->dims()); + scale_tz = common::vectorize(Scale->dims()); PADDLE_ENFORCE_EQ( scale_tz.size(), 1, @@ -80,7 +80,7 @@ void BatchNormGradFunctor(const Context& dev_ctx, scale_tz.size())); } if (use_bias) { - bias_tz = vectorize(Bias->dims()); + bias_tz = common::vectorize(Bias->dims()); PADDLE_ENFORCE_EQ( bias_tz.size(), 1, diff --git a/paddle/phi/kernels/onednn/batch_norm_kernel.cc b/paddle/phi/kernels/onednn/batch_norm_kernel.cc index 070058062b6f49..9925aed9932565 100644 --- a/paddle/phi/kernels/onednn/batch_norm_kernel.cc +++ b/paddle/phi/kernels/onednn/batch_norm_kernel.cc @@ -98,7 +98,7 @@ void BatchNormKernel(const Context &dev_ctx, astream.wait(); if (!global_stats) { - const unsigned int C = phi::vectorize(mean.dims())[0]; + const unsigned int C = common::vectorize(mean.dims())[0]; // mkldnn only compute stats for current batch // so we need compute momentum stats via Eigen lib diff --git a/paddle/phi/kernels/onednn/cast_kernel.cc b/paddle/phi/kernels/onednn/cast_kernel.cc index 74298cc055e0cb..9bf0a3e8a875fa 100644 --- a/paddle/phi/kernels/onednn/cast_kernel.cc +++ b/paddle/phi/kernels/onednn/cast_kernel.cc @@ -29,7 +29,7 @@ void CastKernel(const Context& dev_ctx, dnnl::memory::data_type in_dnnl_dtype = 
funcs::ToOneDNNDataType(in_dtype); dnnl::memory::data_type out_dnnl_dtype = funcs::ToOneDNNDataType(out_dtype); - auto x_tz = phi::vectorize(x.dims()); + auto x_tz = common::vectorize(x.dims()); funcs::ReorderOneDNNHandler reorder_handler(x_tz, in_dtype, diff --git a/paddle/phi/kernels/onednn/concat_grad_kernel.cc b/paddle/phi/kernels/onednn/concat_grad_kernel.cc index 29477a3ead8ae2..bbc57328ac2d6e 100644 --- a/paddle/phi/kernels/onednn/concat_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/concat_grad_kernel.cc @@ -38,7 +38,7 @@ void ConcatGradKernel(const Context& dev_ctx, int axis = axis_scalar.to(); - auto out_grad_vec_dims = vectorize(out_grad.dims()); + auto out_grad_vec_dims = common::vectorize(out_grad.dims()); axis = funcs::ComputeAxis(axis, out_grad_vec_dims.size()); @@ -53,7 +53,7 @@ void ConcatGradKernel(const Context& dev_ctx, for (auto& grad : x_grad) { if (grad->numel() != 0UL) { - auto x_grad_vec_dims = vectorize(grad->dims()); + auto x_grad_vec_dims = common::vectorize(grad->dims()); auto slice_mem_p = reorder_handler.AcquireSubmemory( x_grad_vec_dims, offset, reorder_src_memory_p); diff --git a/paddle/phi/kernels/onednn/concat_kernel.cc b/paddle/phi/kernels/onednn/concat_kernel.cc index 0c9dbf5a85497a..f3ff30e2fa8613 100644 --- a/paddle/phi/kernels/onednn/concat_kernel.cc +++ b/paddle/phi/kernels/onednn/concat_kernel.cc @@ -56,7 +56,7 @@ class ConcatOneDNNHandler : public OneDNNHandlerNoCachingT { srcs_md.push_back(input->mem_desc()); } - auto dst_dims = vectorize(output->dims()); + auto dst_dims = common::vectorize(output->dims()); memory::desc dst_md = memory::desc(dst_dims, dt, OneDNNMemoryFormat::any); @@ -104,7 +104,7 @@ void ConcatKernel(const Context& dev_ctx, auto multi_input = ReduceMultiInput(x); EnforceLayouts(multi_input); - auto out_dims_vec = vectorize(out->dims()); + auto out_dims_vec = common::vectorize(out->dims()); if (std::any_of(out_dims_vec.begin(), out_dims_vec.end(), [](int64_t i) { return i < 0; })) { diff --git a/paddle/phi/kernels/onednn/conv_grad_kernel.cc b/paddle/phi/kernels/onednn/conv_grad_kernel.cc index 93df685293fc3f..230d93f56966bd 100644 --- a/paddle/phi/kernels/onednn/conv_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/conv_grad_kernel.cc @@ -125,7 +125,7 @@ void ConvGradKernel(const Context& dev_ctx, funcs::ToOneDNNDataType(filter.dtype()); // for 3d conv with groups (six dimensional data reorder to // goidhw) for 2d conv with groups (five dimensional data reorder - // to goihw) auto weights_tz = phi::vectorize(filter->dims()); + // to goihw) auto weights_tz = common::vectorize(filter->dims()); auto weights_tz = diff_weights_memory_p->get_desc().get_dims(); dnnl::memory::format_tag out_format = @@ -151,10 +151,10 @@ void ConvGradKernel(const Context& dev_ctx, dnnl::memory::format_tag target_format = weights_tz.size() == 6 ? 
dnnl::memory::format_tag::oidhw : dnnl::memory::format_tag::oihw; - filter_grad->set_mem_desc( - dnnl::memory::desc(phi::vectorize(filter_grad->dims()), - in_type, - target_format)); + filter_grad->set_mem_desc(dnnl::memory::desc( + common::vectorize(filter_grad->dims()), + in_type, + target_format)); } else { filter_grad->set_mem_desc(diff_weights_memory_p->get_desc()); } @@ -248,7 +248,7 @@ KernelKey ConvGradGetKernelTypeForVar(const GetKernelTypeForVarContext* ctx) { (tensor.layout() != phi::DataLayout::ONEDNN)) { auto it = attrs.find("data_format"); const std::string data_format = PADDLE_GET_CONST(std::string, it->second); - auto dl = phi::StringToDataLayout(data_format); + auto dl = common::StringToDataLayout(data_format); // Some models may have intentionally set "AnyLayout" for pool // op. Treat this as NCHW (default data_format value) if (dl != phi::DataLayout::kAnyLayout) { diff --git a/paddle/phi/kernels/onednn/conv_handler.h b/paddle/phi/kernels/onednn/conv_handler.h index 86baabf45afc10..3d41c274de24e6 100644 --- a/paddle/phi/kernels/onednn/conv_handler.h +++ b/paddle/phi/kernels/onednn/conv_handler.h @@ -14,10 +14,10 @@ #pragma once +#include "paddle/common/macros.h" #include "paddle/phi/backends/onednn/onednn_helper.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/expect.h" -#include "paddle/phi/core/macros.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/cpu/conv_util.h" @@ -68,7 +68,7 @@ class ConvOneDNNHandlerT onednn_engine, cpu_place, funcs::CreateKey( - dev_ctx, phi::vectorize(input->dims()), unique_name)) { + dev_ctx, common::vectorize(input->dims()), unique_name)) { if (unlikely(!this->isCached())) { PADDLE_ENFORCE_EQ( input->layout(), @@ -133,11 +133,12 @@ class ConvOneDNNHandlerT bias->dims().size())); } const auto input_dims = input->dims(); - const auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); + const auto data_dims = + common::slice_ddim(input_dims, 2, input_dims.size()); const auto filter_dims = filter->dims(); const auto filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - const auto ksize = phi::vectorize(filter_data_dims); + common::slice_ddim(filter_dims, 2, filter_dims.size()); + const auto ksize = common::vectorize(filter_data_dims); std::vector strides(begin(strides_in), end(strides_in)); std::vector paddings(begin(paddings_in), end(paddings_in)); std::vector dilations(begin(dilations_in), end(dilations_in)); @@ -148,12 +149,12 @@ class ConvOneDNNHandlerT return i - 1; }); - const auto src_tz = phi::vectorize(input->dims()); + const auto src_tz = common::vectorize(input->dims()); - auto weights_tz = phi::vectorize(filter->dims()); + auto weights_tz = common::vectorize(filter->dims()); funcs::GetGroupConvWeightsTz(weights_tz, groups); - const auto dst_tz = phi::vectorize(output->dims()); + const auto dst_tz = common::vectorize(output->dims()); const dnnl::memory::dims stride_dims = strides; const auto onednn_paddings = funcs::ToOneDNNPadding(paddings); @@ -193,7 +194,7 @@ class ConvOneDNNHandlerT fuse_activation); if (bias) { - auto bias_tz = phi::vectorize(bias->dims()); + auto bias_tz = common::vectorize(bias->dims()); dnnl::memory::desc bias_md = funcs::OneDNNMemDesc(bias_tz, dnnl::memory::data_type::f32, @@ -251,7 +252,7 @@ class ConvOneDNNHandlerT dev_ctx.GetEngine(), cpu_place, funcs::CreateKey( - dev_ctx, phi::vectorize(in->dims()), unique_name)) { + dev_ctx, common::vectorize(in->dims()), unique_name)) { if (unlikely(!this->isBwdCached())) { 
PADDLE_ENFORCE_EQ( in->layout(), @@ -288,21 +289,21 @@ class ConvOneDNNHandlerT std::vector dilations(begin(dilations_in), end(dilations_in)); auto input_dims = in->dims(); - auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); + auto data_dims = common::slice_ddim(input_dims, 2, input_dims.size()); auto filter_dims = filter->dims(); auto filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - auto ksize = phi::vectorize(filter_data_dims); + common::slice_ddim(filter_dims, 2, filter_dims.size()); + auto ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, data_dims, strides, ksize); - auto src_tz = phi::vectorize(in->dims()); - auto weights_tz = phi::vectorize(filter->dims()); + auto src_tz = common::vectorize(in->dims()); + auto weights_tz = common::vectorize(filter->dims()); int g = std::max(groups, 1); funcs::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = phi::vectorize(out_grad->dims()); + auto dst_tz = common::vectorize(out_grad->dims()); /* create memory descriptor for conv backward without specified format * ('any') which lets a primitive (conv backward in this case) choose @@ -335,7 +336,7 @@ class ConvOneDNNHandlerT // Recreating FWD PD. For training there are no post ops in convolution dnnl::primitive_attr conv_attr; if (bias) { - auto bias_tz = phi::vectorize(bias->dims()); + auto bias_tz = common::vectorize(bias->dims()); dnnl::memory::desc bias_md = funcs::OneDNNMemDesc(bias_tz, dnnl::memory::data_type::f32, @@ -443,7 +444,7 @@ class ConvOneDNNHandlerT AcquireWeightsMemoryWithReorderFromDataPrimitive( const phi::DenseTensor* filter, const int groups, const bool is_conv3d) { const K* filter_data = filter->data(); - auto weights_tz = phi::vectorize(filter->dims()); + auto weights_tz = common::vectorize(filter->dims()); funcs::GetGroupConvWeightsTz(weights_tz, groups); auto user_src_md = @@ -538,7 +539,7 @@ class ConvOneDNNHandlerT return weights_mem_p; } else if (is_test) { const K* filter_data = filter->data(); - auto weights_tz = phi::vectorize(filter->dims()); + auto weights_tz = common::vectorize(filter->dims()); funcs::GetGroupConvWeightsTz(weights_tz, groups); auto user_src_md = @@ -556,7 +557,7 @@ class ConvOneDNNHandlerT mask); } else { const T* filter_data = filter->data(); - auto weights_tz = phi::vectorize(filter->dims()); + auto weights_tz = common::vectorize(filter->dims()); funcs::GetGroupConvWeightsTz(weights_tz, groups); auto user_src_md = diff --git a/paddle/phi/kernels/onednn/conv_kernel.cc b/paddle/phi/kernels/onednn/conv_kernel.cc index 8039dab862c66c..0007c717a4d9db 100644 --- a/paddle/phi/kernels/onednn/conv_kernel.cc +++ b/paddle/phi/kernels/onednn/conv_kernel.cc @@ -124,7 +124,7 @@ KernelKey ConvGetKernelTypeForVar(const GetKernelTypeForVarContext* ctx) { (tensor.layout() != phi::DataLayout::ONEDNN)) { auto it = attrs.find("data_format"); const std::string data_format = PADDLE_GET_CONST(std::string, it->second); - auto dl = phi::StringToDataLayout(data_format); + auto dl = common::StringToDataLayout(data_format); // Some models may have intentionally set "AnyLayout" for conv // op. 
Treat this as NCHW (default data_format value) if (dl != phi::DataLayout::kAnyLayout) { diff --git a/paddle/phi/kernels/onednn/conv_transpose_kernel.cc b/paddle/phi/kernels/onednn/conv_transpose_kernel.cc index 1a056a48859318..fcf13bda144cc1 100644 --- a/paddle/phi/kernels/onednn/conv_transpose_kernel.cc +++ b/paddle/phi/kernels/onednn/conv_transpose_kernel.cc @@ -26,7 +26,7 @@ namespace phi { inline dnnl::memory::dims GetWeightsTz(const phi::DenseTensor* filter, const int groups) { - auto weights_tz = phi::vectorize(filter->dims()); + auto weights_tz = common::vectorize(filter->dims()); int g = std::max(groups, 1); int g_dim = (g > 1) ? 1 : 0; funcs::GetGroupConvWeightsTz(weights_tz, g); @@ -119,11 +119,11 @@ class ConvTransposeOneDNNHandlerT "Now we only support 2d oneDNN convolution transpose op")); const auto x_dims = x->dims(); - const auto x_data_dims = phi::slice_ddim(x_dims, 2, x_dims.size()); + const auto x_data_dims = common::slice_ddim(x_dims, 2, x_dims.size()); const auto filter_dims = filter->dims(); const auto filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - const auto ksize = phi::vectorize(filter_data_dims); + common::slice_ddim(filter_dims, 2, filter_dims.size()); + const auto ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, x_data_dims, strides, ksize); @@ -132,9 +132,9 @@ class ConvTransposeOneDNNHandlerT return i - 1; }); - const auto src_tz = phi::vectorize(x->dims()); + const auto src_tz = common::vectorize(x->dims()); const auto weights_tz = GetWeightsTz(filter, groups); - const auto dst_tz = phi::vectorize(out->dims()); + const auto dst_tz = common::vectorize(out->dims()); const auto onednn_paddings = funcs::ToOneDNNPadding(paddings); /* create memory descriptor for convolution without specified format @@ -164,7 +164,7 @@ class ConvTransposeOneDNNHandlerT : dnnl::prop_kind::forward_training; if (bias) { - std::vector bias_tz = phi::vectorize(bias->dims()); + std::vector bias_tz = common::vectorize(bias->dims()); const auto bias_md = funcs::OneDNNMemDesc( bias_tz, data_type, funcs::OneDNNMemoryFormat::x); this->AcquireForwardPrimitiveDescriptor( @@ -312,7 +312,7 @@ class ConvTransposeOneDNNHandlerT const std::string& key, const phi::DenseTensor* bias) { const K* bias_data = bias->data(); - auto user_bias_md = funcs::OneDNNMemDesc(phi::vectorize(bias->dims()), + auto user_bias_md = funcs::OneDNNMemDesc(common::vectorize(bias->dims()), funcs::OneDNNGetDataType(), funcs::OneDNNMemoryFormat::x); return this->AcquireMemoryWithReorder(dev_ctx, @@ -446,7 +446,7 @@ KernelKey ConvTransposeGetKernelTypeForVar( (tensor.layout() != phi::DataLayout::ONEDNN)) { auto it = attrs.find("data_format"); const std::string data_format = PADDLE_GET_CONST(std::string, it->second); - auto dl = phi::StringToDataLayout(data_format); + auto dl = common::StringToDataLayout(data_format); // Some models may have intentionally set "AnyLayout" for pool // op. 
Treat this as NCHW (default data_format value) if (dl != phi::DataLayout::kAnyLayout) { diff --git a/paddle/phi/kernels/onednn/dequantize_kernel.cc b/paddle/phi/kernels/onednn/dequantize_kernel.cc index 384ca7ea1e6383..9ce975733f3e46 100644 --- a/paddle/phi/kernels/onednn/dequantize_kernel.cc +++ b/paddle/phi/kernels/onednn/dequantize_kernel.cc @@ -44,7 +44,7 @@ void DeQuantKernel(const Context& dev_ctx, const bool with_shift = q_shift != 0; - auto x_tz = phi::vectorize(x.dims()); + auto x_tz = common::vectorize(x.dims()); auto x_type = phi::funcs::ToOneDNNDataType(x.dtype()); auto out_type = phi::funcs::ToOneDNNDataType(out->dtype()); diff --git a/paddle/phi/kernels/onednn/elementwise_grad_kernel.cc b/paddle/phi/kernels/onednn/elementwise_grad_kernel.cc index bec2aa8228c21b..c7a3a7ee93e84f 100644 --- a/paddle/phi/kernels/onednn/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/elementwise_grad_kernel.cc @@ -25,8 +25,8 @@ namespace funcs { inline std::vector CalculateBroadcastedDims( const phi::DenseTensor* x, const phi::DenseTensor* y) { - const auto src_tz = phi::vectorize(x->dims()); - const auto dst_tz = phi::vectorize(y->dims()); + const auto src_tz = common::vectorize(x->dims()); + const auto dst_tz = common::vectorize(y->dims()); std::vector dst_tz_ex(src_tz.size(), 1); @@ -103,7 +103,7 @@ inline void BroadcastReduction(const Place& place, astream.wait(); auto grad_shape = grad_tensor->dims().size() == 0 ? std::vector{1} - : phi::vectorize(grad_tensor->dims()); + : common::vectorize(grad_tensor->dims()); grad_tensor->set_mem_desc(dst_memory->get_desc().reshape(grad_shape)); } @@ -135,7 +135,7 @@ void ElementwiseGradKernel(const OneDNNContext& dev_ctx, scale = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1; } - auto tz = phi::vectorize(dout.dims()); + auto tz = common::vectorize(dout.dims()); funcs::ReorderOneDNNHandler reorder_handler( tz, dout.dtype(), funcs::ToOneDNNDataType(dout.dtype()), onednn_engine); diff --git a/paddle/phi/kernels/onednn/expand_grad_kernel.cc b/paddle/phi/kernels/onednn/expand_grad_kernel.cc index 4f4ef1fd544e44..a8b1beb45832f8 100644 --- a/paddle/phi/kernels/onednn/expand_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/expand_grad_kernel.cc @@ -26,8 +26,8 @@ void ExpandGradKernel(const Context& dev_ctx, DenseTensor* in_grad) { const auto& onednn_engine = dev_ctx.GetEngine(); - auto in_grad_vec_dims = vectorize(in_grad->dims()); - auto out_grad_vec_dims = vectorize(out_grad.dims()); + auto in_grad_vec_dims = common::vectorize(in_grad->dims()); + auto out_grad_vec_dims = common::vectorize(out_grad.dims()); if (in_grad_vec_dims.size() != out_grad_vec_dims.size()) { in_grad_vec_dims.insert(in_grad_vec_dims.begin(), @@ -81,9 +81,10 @@ void ExpandGradKernel(const Context& dev_ctx, reduction_p->execute(astream, reduction_args); astream.wait(); in_grad->set_layout(DataLayout::ONEDNN); - const auto in_grad_md_dims = in_grad->dims().size() != 0 - ? vectorize(in_grad->dims()) - : std::vector{1}; + const auto in_grad_md_dims = + in_grad->dims().size() != 0 + ? 
common::vectorize(in_grad->dims()) + : std::vector{1}; in_grad->set_mem_desc(dst_memory_p->get_desc().reshape(in_grad_md_dims)); } } diff --git a/paddle/phi/kernels/onednn/expand_kernel.cc b/paddle/phi/kernels/onednn/expand_kernel.cc index 229a80c6b623bd..140fbbed6fc71d 100644 --- a/paddle/phi/kernels/onednn/expand_kernel.cc +++ b/paddle/phi/kernels/onednn/expand_kernel.cc @@ -36,7 +36,7 @@ void ExpandKernel(const Context& dev_ctx, DenseTensor* out) { const auto& onednn_engine = dev_ctx.GetEngine(); - auto x_vec_dims = vectorize(x.dims()); + auto x_vec_dims = common::vectorize(x.dims()); auto out_new_dims = shape.GetData(); @@ -48,7 +48,7 @@ void ExpandKernel(const Context& dev_ctx, x_vec_dims = GetExtendedXDims(x_vec_dims, out_new_dims.size()); } - out->Resize(make_ddim(out_new_dims)); + out->Resize(common::make_ddim(out_new_dims)); funcs::BroadcastDataOneDNNHandler handler(dnnl::algorithm::binary_add, onednn_engine, dev_ctx.GetPlace(), diff --git a/paddle/phi/kernels/onednn/full_kernel.cc b/paddle/phi/kernels/onednn/full_kernel.cc index 6ce5625c7f54cd..886c715693e9f7 100644 --- a/paddle/phi/kernels/onednn/full_kernel.cc +++ b/paddle/phi/kernels/onednn/full_kernel.cc @@ -61,7 +61,7 @@ void FullKernel(const Context& dev_ctx, const auto& onednn_engine = dev_ctx.GetEngine(); T fill_value = val.to(); - out->Resize(make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); funcs::FillConstantOneDNNHandler handler( out, onednn_engine, dev_ctx.GetPlace()); @@ -92,7 +92,7 @@ void FullKernel(const Context& dev_ctx, // src0_memory_p's md was just to allow the usage of a binary // primitive as a memset, and now we need to create a real one - out->set_mem_desc({vectorize(out->dims()), + out->set_mem_desc({common::vectorize(out->dims()), funcs::OneDNNGetDataType(), funcs::GetPlainOneDNNFormat(out->dims().size())}); } diff --git a/paddle/phi/kernels/onednn/gaussian_kernel.cc b/paddle/phi/kernels/onednn/gaussian_kernel.cc index a850aee10c31bb..98197961a9df6b 100644 --- a/paddle/phi/kernels/onednn/gaussian_kernel.cc +++ b/paddle/phi/kernels/onednn/gaussian_kernel.cc @@ -41,7 +41,7 @@ void GaussianKernel(const Context& ctx, data[i] = dist(*engine); } - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); dnnl::memory::desc out_mem_desc = phi::funcs::make_memory_desc(*out, DataLayout::NCHW); out->set_mem_desc(out_mem_desc); diff --git a/paddle/phi/kernels/onednn/interpolate_kernel.cc b/paddle/phi/kernels/onednn/interpolate_kernel.cc index 082e21bafa0e2e..be3e158cf384a9 100644 --- a/paddle/phi/kernels/onednn/interpolate_kernel.cc +++ b/paddle/phi/kernels/onednn/interpolate_kernel.cc @@ -33,7 +33,7 @@ KernelKey InterpolateGetKernelTypeForVar( (tensor.layout() != DataLayout::ONEDNN)) { auto it = attrs.find("data_layout"); const std::string data_layout = PADDLE_GET_CONST(std::string, it->second); - auto dl = StringToDataLayout(data_layout); + auto dl = common::StringToDataLayout(data_layout); // Some models may have intentionally set "AnyLayout" for pool // op. 
Treat this as NCHW (default data_format value) if (dl != DataLayout::kAnyLayout) { @@ -62,7 +62,7 @@ class InterpolateOneDNNHandler DenseTensor* out) : OneDNNHandlerNoCachingT(engine, cpu_place) { - const auto dst_tz = vectorize(out->dims()); + const auto dst_tz = common::vectorize(out->dims()); const auto dst_md = dnnl::memory::desc( dst_tz, OneDNNGetDataType(), OneDNNMemoryFormat::any); this->AcquireForwardPrimitiveDescriptor( @@ -126,7 +126,7 @@ std::vector ComputeOutputShape( if (scale.size() == 3 && scale[0] > 0.0f && scale[1] > 0.0f && scale[2] > 0.0f) { int j = 0; - std::vector in_dhw_vec = vectorize(in_dhw_dims); + std::vector in_dhw_vec = common::vectorize(in_dhw_dims); std::transform( in_dhw_vec.begin(), in_dhw_vec.end(), @@ -176,7 +176,7 @@ void InterpolateKernel( out_h, out_w, scale); - DDim dim_out = make_ddim(out_dims_vec); + DDim dim_out = common::make_ddim(out_dims_vec); out->Resize(dim_out); funcs::InterpolateOneDNNHandler handler( diff --git a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc index 0dcc7195800c1d..3866a2d06ae45c 100644 --- a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc @@ -74,9 +74,9 @@ void CalculateGradMatrixDims(const OneDNNContext &dev_ctx, } } - dx_tmp->Resize(make_ddim(*dx_bd_dims)); + dx_tmp->Resize(common::make_ddim(*dx_bd_dims)); dev_ctx.template Alloc(dx_tmp); - dy_tmp->Resize(make_ddim(*dy_bd_dims)); + dy_tmp->Resize(common::make_ddim(*dy_bd_dims)); dev_ctx.template Alloc(dy_tmp); } @@ -117,9 +117,9 @@ void MatmulGradKernel(const Context &dev_ctx, bool transpose_y, DenseTensor *dx, DenseTensor *dy) { - auto x_dims = vectorize(x.dims()); - auto y_dims = vectorize(y.dims()); - auto dout_dims = vectorize(dout.dims()); + auto x_dims = common::vectorize(x.dims()); + auto y_dims = common::vectorize(y.dims()); + auto dout_dims = common::vectorize(dout.dims()); size_t ndims = std::max(x_dims.size(), y_dims.size()); ndims = std::max(ndims, 3); diff --git a/paddle/phi/kernels/onednn/matmul_kernel.cc b/paddle/phi/kernels/onednn/matmul_kernel.cc index 4a7081dfac0254..d11cf70eaa0251 100644 --- a/paddle/phi/kernels/onednn/matmul_kernel.cc +++ b/paddle/phi/kernels/onednn/matmul_kernel.cc @@ -77,7 +77,7 @@ void CalculateMatrixDims(const std::vector &x_dims, } if (x_dims.size() > 2 && y_dims.size() > 2) { - auto out_dims = vectorize(out->dims()); + auto out_dims = common::vectorize(out->dims()); for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) { PADDLE_ENFORCE_EQ( (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 || @@ -93,7 +93,7 @@ void CalculateMatrixDims(const std::vector &x_dims, (*y_bd_dims)[i])); (out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); } - out->Resize(make_ddim((out_dims))); + out->Resize(common::make_ddim((out_dims))); } } @@ -123,8 +123,8 @@ void MatmulKernel(const Context &dev_ctx, ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output")) : false; - auto x_dims = vectorize(x.dims()); - auto y_dims = vectorize(y.dims()); + auto x_dims = common::vectorize(x.dims()); + auto y_dims = common::vectorize(y.dims()); int ndims = std::max(x_dims.size(), y_dims.size()); ndims = std::max(ndims, 3); @@ -374,7 +374,7 @@ class MulPrimitiveFactory { const DenseTensor *tensor, funcs::OneDNNMemoryFormat format, memory::data_type type = funcs::OneDNNGetDataType()) { - auto dims = vectorize(tensor->dims()); + auto dims = common::vectorize(tensor->dims()); return funcs::OneDNNMemDesc(dims, type, format); } @@ -423,7 +423,7 @@ class MulPrimitiveFactory { } memory TransposeInputY(const DenseTensor *input_y) { - auto dims = vectorize(input_y->dims()); + auto dims = common::vectorize(input_y->dims()); std::swap(dims[0], dims[1]); // Correct output dimensions auto src_desc = CreateMemDescriptor(dims, funcs::OneDNNMemoryFormat::io); @@ -451,9 +451,9 @@ std::shared_ptr> GetPrimitiveFactory( const engine &onednn_engine) { std::string key = funcs::CreateKey(dev_ctx, phi::TransToProtoVarType(input_x->dtype()), - vectorize(input_x->dims()), + common::vectorize(input_x->dims()), phi::TransToProtoVarType(input_y->dtype()), - vectorize(input_y->dims()), + common::vectorize(input_y->dims()), dev_ctx.GetOutputsName("Out")[0]); key = funcs::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); @@ -527,8 +527,8 @@ void MatmulWithFlattenKernelINT8(const Context &dev_ctx, mul.get_primitive_desc(), dnnl_query_dst_md, 0); dnnl_memory_desc_t cloned_in_md = nullptr; dnnl_memory_desc_clone(&cloned_in_md, in_md); - out->set_mem_desc( - memory::desc(cloned_in_md).reshape(vectorize(out->dims()))); + out->set_mem_desc(memory::desc(cloned_in_md) + .reshape(common::vectorize(out->dims()))); } template diff --git a/paddle/phi/kernels/onednn/pad3d_kernel.cc b/paddle/phi/kernels/onednn/pad3d_kernel.cc index 2a489c258889a5..c184dd4cbf4a1e 100644 --- a/paddle/phi/kernels/onednn/pad3d_kernel.cc +++ b/paddle/phi/kernels/onednn/pad3d_kernel.cc @@ -30,7 +30,7 @@ KernelKey Pad3dGetKernelTypeForVar(const GetKernelTypeForVarContext* ctx) { auto it = attrs.find("data_format"); const std::string data_format = PADDLE_GET_CONST(std::string, it->second); return phi::KernelKey(tensor.place(), - phi::StringToDataLayout(data_format), + common::StringToDataLayout(data_format), expected_kernel_type.dtype()); } #endif diff --git a/paddle/phi/kernels/onednn/pad_kernel_impl.h b/paddle/phi/kernels/onednn/pad_kernel_impl.h index eabe18855b796e..0c360e1dabbc31 100644 --- a/paddle/phi/kernels/onednn/pad_kernel_impl.h +++ b/paddle/phi/kernels/onednn/pad_kernel_impl.h @@ -112,7 +112,7 @@ void PadOpKernel(const Context& dev_ctx, const auto& onednn_engine = dev_ctx.GetEngine(); auto& astream = OneDNNContext::tls().get_stream(); - std::vector x_tz = vectorize(x.dims()); + std::vector x_tz = common::vectorize(x.dims()); // due to the need of supporting NDHWC, inferring out shape // must be done inside the kernel std::vector out_tz(x_tz); @@ -120,7 +120,7 @@ void PadOpKernel(const Context& dev_ctx, for (size_t i = 0; i < paddings.size() / 2; ++i) { out_tz[out_tz.size() - 1 - i] += paddings[2 * i] + paddings[2 * i + 1]; } - out->Resize(make_ddim(out_tz)); + out->Resize(common::make_ddim(out_tz)); funcs::ReorderOneDNNHandler reorder_handler( x_tz, x.dtype(), funcs::ToOneDNNDataType(x.dtype()), onednn_engine); diff --git a/paddle/phi/kernels/onednn/pool_grad_kernel.cc b/paddle/phi/kernels/onednn/pool_grad_kernel.cc index 037c6f1b7f35c7..f5b10186a4ebc6 
100644 --- a/paddle/phi/kernels/onednn/pool_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/pool_grad_kernel.cc @@ -83,7 +83,7 @@ phi::KernelKey PoolOpGradGetKernelTypeForVar( auto it = attrs.find("data_format"); const std::string data_format = PADDLE_GET_CONST(std::string, it->second); return phi::KernelKey(tensor.place(), - phi::StringToDataLayout(data_format), + common::StringToDataLayout(data_format), expected_kernel_type.dtype()); } #endif diff --git a/paddle/phi/kernels/onednn/pool_kernel.cc b/paddle/phi/kernels/onednn/pool_kernel.cc index 4d853421267a13..655cd67ab52df8 100644 --- a/paddle/phi/kernels/onednn/pool_kernel.cc +++ b/paddle/phi/kernels/onednn/pool_kernel.cc @@ -81,7 +81,7 @@ phi::KernelKey PoolOpGetKernelTypeForVar( const AttributeMap& attrs = ctx->GetAttrs(); auto it = attrs.find("data_format"); const std::string data_format = PADDLE_GET_CONST(std::string, it->second); - auto dl = phi::StringToDataLayout(data_format); + auto dl = common::StringToDataLayout(data_format); // Some models may have intentionally set "AnyLayout" for pool // op. Treat this as NCHW (default data_format value) if (dl != phi::DataLayout::kAnyLayout) { diff --git a/paddle/phi/kernels/onednn/reduce_kernel_impl.h b/paddle/phi/kernels/onednn/reduce_kernel_impl.h index 1381b37d57107e..8542bce6437e2a 100644 --- a/paddle/phi/kernels/onednn/reduce_kernel_impl.h +++ b/paddle/phi/kernels/onednn/reduce_kernel_impl.h @@ -23,11 +23,11 @@ inline std::vector CalculateReducedDims( const std::vector& reduce_dims, // NOLINT bool reduce_all, bool keep_dim) { - if (keep_dim) return vectorize(output->dims()); + if (keep_dim) return common::vectorize(output->dims()); if (reduce_all) return std::vector(input->dims().size(), 1); - std::vector output_dims(vectorize(input->dims())); + std::vector output_dims(common::vectorize(input->dims())); for (size_t i = 0; i < reduce_dims.size(); ++i) { // handle negative dims, f.e. "-1" means rightmost dimension int index = (reduce_dims[i] >= 0) ? reduce_dims[i] @@ -48,7 +48,7 @@ void ReduceKernel(const Context& dev_ctx, dnnl::algorithm reduction_type) { reduce_all = recompute_reduce_all(x, dims, reduce_all); const auto& onednn_engine = dev_ctx.GetEngine(); - auto x_tz = vectorize(x.dims()); + auto x_tz = common::vectorize(x.dims()); auto out_tz = CalculateReducedDims(&x, out, dims.GetData(), reduce_all, keep_dim); @@ -78,7 +78,7 @@ void ReduceKernel(const Context& dev_ctx, astream.wait(); const auto reshape_dims = out->dims().size() != 0 - ? vectorize(out->dims()) + ? common::vectorize(out->dims()) : std::vector{1}; out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(reshape_dims)); } else { @@ -103,7 +103,7 @@ void ReduceKernel(const Context& dev_ctx, astream.wait(); const auto reshape_dims = out->dims().size() != 0 - ? vectorize(out->dims()) + ? 
common::vectorize(out->dims()) : std::vector{1}; out->set_mem_desc(dst_memory_p->get_desc().reshape(reshape_dims)); } @@ -125,7 +125,7 @@ void ReduceGradKernel(const Context& dev_ctx, const auto& onednn_engine = dev_ctx.GetEngine(); auto out_grad_tz = CalculateReducedDims( x_grad, &out_grad, dims.GetData(), reduce_all, keep_dim); - auto x_grad_tz = vectorize(x_grad->dims()); + auto x_grad_tz = common::vectorize(x_grad->dims()); funcs::BroadcastDataOneDNNHandler handler(binary_type, onednn_engine, diff --git a/paddle/phi/kernels/onednn/reduce_mean_grad_kernel.cc b/paddle/phi/kernels/onednn/reduce_mean_grad_kernel.cc index fd566782b182e7..e59a02c1b1a102 100644 --- a/paddle/phi/kernels/onednn/reduce_mean_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/reduce_mean_grad_kernel.cc @@ -26,7 +26,7 @@ void MeanGradKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* x_grad) { reduce_all = recompute_reduce_all(x, dims, reduce_all); - auto input_dims = phi::vectorize(x.dims()); + auto input_dims = common::vectorize(x.dims()); std::vector reduce_dims = dims.GetData(); int number_of_elements = 1; if (reduce_all == false) { diff --git a/paddle/phi/kernels/onednn/reshape_kernel.cc b/paddle/phi/kernels/onednn/reshape_kernel.cc index 47e1fad0936b29..c7d83ba53569bd 100644 --- a/paddle/phi/kernels/onednn/reshape_kernel.cc +++ b/paddle/phi/kernels/onednn/reshape_kernel.cc @@ -17,7 +17,7 @@ namespace phi { static DDim ValidateShape(const std::vector& shape, const DDim& in_dims) { const int64_t in_size = product(in_dims); - auto in_dims_vec = vectorize(in_dims); + auto in_dims_vec = common::vectorize(in_dims); bool all_positive = std::all_of(in_dims_vec.cbegin(), in_dims_vec.cend(), [](int64_t i) { return i > 0; }); @@ -37,7 +37,7 @@ static DDim ValidateShape(const std::vector& shape, errors::InvalidArgument( "Only one dimension value of 'shape' in ReshapeOp can " "be -1. But received shape = [%s], shape[%d] is also -1.", - make_ddim(shape), + common::make_ddim(shape), i)); unk_dim_idx = i; } else if (shape[i] == copy_dim_val) { @@ -49,7 +49,7 @@ static DDim ValidateShape(const std::vector& shape, "the input tensor X's dimensions. " "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " "X's dimensions = %d.", - make_ddim(shape), + common::make_ddim(shape), i, in_dims, in_dims.size())); @@ -61,7 +61,7 @@ static DDim ValidateShape(const std::vector& shape, "Each dimension value of 'shape' in ReshapeOp must not " "be negative except one unknown dimension. " "But received shape = [%s], shape[%d] = %d.", - make_ddim(shape), + common::make_ddim(shape), i, shape[i])); } @@ -88,7 +88,7 @@ static DDim ValidateShape(const std::vector& shape, "'shape' is [%s], known capacity of 'shape' is %d.", in_dims, in_size, - make_ddim(shape), + common::make_ddim(shape), capacity)); } else { output_shape[unk_dim_idx] = -1; @@ -106,11 +106,11 @@ static DDim ValidateShape(const std::vector& shape, "[%s], the capacity of 'shape' is %d.", in_dims, in_size, - make_ddim(shape), + common::make_ddim(shape), capacity)); } } - return make_ddim(output_shape); + return common::make_ddim(output_shape); } template @@ -143,8 +143,8 @@ void ExecuteReshape(const Context& dev_ctx, astream.wait(); out->Resize(out_dims); - const auto reshape_dims = - out_dims.size() != 0 ? vectorize(out_dims) : std::vector{1}; + const auto reshape_dims = out_dims.size() != 0 ? 
common::vectorize(out_dims) + : std::vector{1}; out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(reshape_dims)); } diff --git a/paddle/phi/kernels/onednn/shape_kernel.cc b/paddle/phi/kernels/onednn/shape_kernel.cc index 51b35ae4a3c0c7..0d3b6eda6700f2 100644 --- a/paddle/phi/kernels/onednn/shape_kernel.cc +++ b/paddle/phi/kernels/onednn/shape_kernel.cc @@ -30,9 +30,9 @@ void ShapeKernel(const Context& dev_ctx, // allocated if (OneDNNContext::tls().get_cur_paddle_data_layout() == DataLayout::kNHWC && x_dims.size() >= 3) { - auto rdims = vectorize(x_dims); + auto rdims = common::vectorize(x_dims); std::rotate(rdims.begin() + 1, rdims.begin() + 2, rdims.end()); - x_dims = make_ddim(rdims); + x_dims = common::make_ddim(rdims); } out->Resize({x_dims.size()}); @@ -42,7 +42,7 @@ void ShapeKernel(const Context& dev_ctx, } dnnl::memory::desc out_mem_desc( - vectorize(out->dims()), + common::vectorize(out->dims()), funcs::ToOneDNNDataType(out->dtype()), funcs::GetPlainOneDNNFormat(out->dims().size())); out->set_mem_desc(out_mem_desc); diff --git a/paddle/phi/kernels/onednn/slice_grad_kernel.cc b/paddle/phi/kernels/onednn/slice_grad_kernel.cc index bc8776362193a6..7f8f6b815b4f0e 100644 --- a/paddle/phi/kernels/onednn/slice_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/slice_grad_kernel.cc @@ -31,7 +31,7 @@ void SliceGradKernel(const Context& dev_ctx, DenseTensor* input_grad) { const auto& onednn_engine = dev_ctx.GetEngine(); - auto dx_dims = vectorize(input_grad->dims()); + auto dx_dims = common::vectorize(input_grad->dims()); auto starts_vec = starts.GetData(); auto ends_vec = ends.GetData(); diff --git a/paddle/phi/kernels/onednn/slice_kernel.cc b/paddle/phi/kernels/onednn/slice_kernel.cc index 9b098a3f14119e..bd59d61c17e794 100644 --- a/paddle/phi/kernels/onednn/slice_kernel.cc +++ b/paddle/phi/kernels/onednn/slice_kernel.cc @@ -30,7 +30,7 @@ void SliceKernel(const Context& dev_ctx, DenseTensor* out) { const auto& onednn_engine = dev_ctx.GetEngine(); - auto x_vec_dims = vectorize(x.dims()); + auto x_vec_dims = common::vectorize(x.dims()); auto starts_vec = starts.GetData(); auto ends_vec = ends.GetData(); @@ -48,7 +48,7 @@ void SliceKernel(const Context& dev_ctx, std::max(static_cast(0), ends_vec[i] - starts_vec[i]); } - out->Resize(make_ddim(slice_dims)); + out->Resize(common::make_ddim(slice_dims)); // Note(0x45f): To support slice Tensors with shapes like [0, 0, 0]. 
if (!x.initialized()) { @@ -93,7 +93,7 @@ void SliceKernel(const Context& dev_ctx, } astream.wait(); - out->Resize(make_ddim(new_out_dims)); + out->Resize(common::make_ddim(new_out_dims)); out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(new_out_dims)); } diff --git a/paddle/phi/kernels/onednn/split_kernel.cc b/paddle/phi/kernels/onednn/split_kernel.cc index a700bc016cd807..cf0cd1d62a0200 100644 --- a/paddle/phi/kernels/onednn/split_kernel.cc +++ b/paddle/phi/kernels/onednn/split_kernel.cc @@ -48,7 +48,7 @@ void SplitKernel(const Context& dev_ctx, auto outs_number = out.size(); const auto x_dims = x.dims(); - auto x_vec_dims = vectorize(x_dims); + auto x_vec_dims = common::vectorize(x_dims); dnnl::memory::data_type x_type = funcs::ToOneDNNDataType(x.dtype()); @@ -61,7 +61,7 @@ void SplitKernel(const Context& dev_ctx, x.mem_desc(), funcs::to_void_cast(x.data())); for (size_t i = 0; i < outs_number; ++i) { - auto out_vec_dims = vectorize(out[i]->dims()); + auto out_vec_dims = common::vectorize(out[i]->dims()); auto slice_mem_p = reorder_handler.AcquireSubmemory( out_vec_dims, offset, reorder_src_memory_p); diff --git a/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc b/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc index 724335b7e51af7..d8ff4e72c1b117 100644 --- a/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/squeeze_grad_kernel.cc @@ -25,7 +25,7 @@ void SqueezeGradKernel(const Context& dev_ctx, const DenseTensor& dout, const IntArray& axes UNUSED, DenseTensor* dx) { - auto dout_vec_dims = dout.dims().size() != 0 ? vectorize(dout.dims()) + auto dout_vec_dims = dout.dims().size() != 0 ? common::vectorize(dout.dims()) : std::vector{1}; auto dout_type = funcs::ToOneDNNDataType(dout.dtype()); @@ -48,7 +48,7 @@ void SqueezeGradKernel(const Context& dev_ctx, auto dx_dims = slice_ddim(xshape.dims(), 1, xshape.dims().size()); dx->Resize(dx_dims); - reorder_dst_memory_p->get_desc().reshape(vectorize(dx_dims)); + reorder_dst_memory_p->get_desc().reshape(common::vectorize(dx_dims)); } } // namespace phi diff --git a/paddle/phi/kernels/onednn/squeeze_kernel.cc b/paddle/phi/kernels/onednn/squeeze_kernel.cc index 9b86f9e1a9c74d..2de2cbb2ecbab8 100644 --- a/paddle/phi/kernels/onednn/squeeze_kernel.cc +++ b/paddle/phi/kernels/onednn/squeeze_kernel.cc @@ -26,7 +26,7 @@ void ExecuteSqueeze(const Context& dev_ctx, const DDim& x_dims, const DDim& out_dims, DenseTensor* out) { - auto x_vec_dims = vectorize(x_dims); + auto x_vec_dims = common::vectorize(x_dims); funcs::ReorderOneDNNHandler reorder_handler( x_vec_dims, @@ -48,8 +48,8 @@ void ExecuteSqueeze(const Context& dev_ctx, out->Resize(out_dims); - auto reshape_dims = - out_dims.size() != 0 ? vectorize(out_dims) : std::vector{1}; + auto reshape_dims = out_dims.size() != 0 ? 
common::vectorize(out_dims) + : std::vector{1}; out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(reshape_dims)); } diff --git a/paddle/phi/kernels/onednn/stack_kernel.cc b/paddle/phi/kernels/onednn/stack_kernel.cc index 130f91ddf3195e..c6a478c62183ac 100644 --- a/paddle/phi/kernels/onednn/stack_kernel.cc +++ b/paddle/phi/kernels/onednn/stack_kernel.cc @@ -36,7 +36,7 @@ class StackOneDNNHandler : public OneDNNHandlerNoCachingT { } // in stack op all inputs must have same dims - auto input_dims = vectorize(inputs[0]->dims()); + auto input_dims = common::vectorize(inputs[0]->dims()); dnnl::memory::data_type dt = ToOneDNNDataType(inputs[0]->dtype()); std::vector srcs_md; @@ -56,7 +56,7 @@ class StackOneDNNHandler : public OneDNNHandlerNoCachingT { input_dims[stack_axis] *= inputs.size(); dst_md = dnnl::memory::desc(input_dims, dt, OneDNNMemoryFormat::any); } else { - auto extended_input_dims = vectorize(output->dims()); + auto extended_input_dims = common::vectorize(output->dims()); extended_input_dims[stack_axis] = 1; for (auto input : inputs) { @@ -67,7 +67,8 @@ class StackOneDNNHandler : public OneDNNHandlerNoCachingT { // distinguish between f.e. abcd and abdc if last dim is equal to 1 so // enforcing is needed for better performance dst_fmt = GetPlainOneDNNFormat(extended_input_dims.size()); - dst_md = dnnl::memory::desc(vectorize(output->dims()), dt, dst_fmt); + dst_md = + dnnl::memory::desc(common::vectorize(output->dims()), dt, dst_fmt); } this->AcquireForwardPrimitiveDescriptor(dst_md, stack_axis, srcs_md); @@ -109,7 +110,8 @@ void StackKernel(const Context& dev_ctx, concat_p->execute(astream, args); astream.wait(); - output->set_mem_desc(dst_mem->get_desc().reshape(vectorize(output->dims()))); + output->set_mem_desc( + dst_mem->get_desc().reshape(common::vectorize(output->dims()))); } } // namespace phi diff --git a/paddle/phi/kernels/onednn/transpose_grad_kernel.cc b/paddle/phi/kernels/onednn/transpose_grad_kernel.cc index ca969b5625b5af..24b63dd1dbce7e 100644 --- a/paddle/phi/kernels/onednn/transpose_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/transpose_grad_kernel.cc @@ -36,7 +36,7 @@ void TransposeGradKernel(const Context& dev_ctx, return; } - std::vector out_grad_tz = vectorize(out_grad.dims()); + std::vector out_grad_tz = common::vectorize(out_grad.dims()); funcs::ReorderOneDNNHandler reorder_handler( out_grad_tz, out_grad.dtype(), diff --git a/paddle/phi/kernels/onednn/transpose_kernel.cc b/paddle/phi/kernels/onednn/transpose_kernel.cc index fe744ffef1ca43..44449fa0ea642e 100644 --- a/paddle/phi/kernels/onednn/transpose_kernel.cc +++ b/paddle/phi/kernels/onednn/transpose_kernel.cc @@ -40,7 +40,7 @@ void TransposeKernel(const Context& dev_ctx, formated_axis[i] = axis[i] + axis_size; } } - auto dims = phi::vectorize(x_dims); + auto dims = common::vectorize(x_dims); std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); x_dims = x_dims.reshape(dims); @@ -65,7 +65,7 @@ void TransposeKernel(const Context& dev_ctx, return; } - auto x_vec_dims = vectorize(x.dims()); + auto x_vec_dims = common::vectorize(x.dims()); auto x_type = funcs::ToOneDNNDataType(x.dtype()); funcs::ReorderOneDNNHandler reorder_handler( x_vec_dims, x.dtype(), x_type, dev_ctx.GetEngine()); diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 2a3579d99cfe67..a78045aa0dc7ca 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -20,7 +20,7 @@ #ifdef 
PADDLE_WITH_HIP #include #endif -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace kps { diff --git a/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc index dccbba6947a1be..74d2bdc0a673fa 100644 --- a/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc +++ b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc @@ -32,7 +32,7 @@ void MultiplyRawKernel(const Context& dev_ctx, int axis, SelectedRows* out) { PADDLE_ENFORCE_EQ( - phi::product(y.dims()), + common::product(y.dims()), 1, phi::errors::InvalidArgument("For MultiplyKernel, if X is Sparse, Y must " "contain only one element.")); diff --git a/paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h index 3fd42fb53b5f76..941878bf419895 100644 --- a/paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h +++ b/paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h @@ -82,7 +82,7 @@ void AddNKernel(const Context &dev_ctx, // no data, just set an empty out tensor. auto *out_dense = out->mutable_value(); out_dense->clear(); - out_dense->Resize(phi::make_ddim({0})); + out_dense->Resize(common::make_ddim({0})); dev_ctx.template Alloc(out_dense); } } diff --git a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h index 0437a48a4c8083..216ab6b55b5242 100644 --- a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h +++ b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h @@ -293,11 +293,11 @@ void ComputeRowImpl(const Context& dev_ctx, // paddle/phi/kernels/impl/lamb_kernel_impl.h Please modify it together // DenseTensor p_norm_t; - // p_norm_t.Resize(phi::make_ddim({1})); + // p_norm_t.Resize(common::make_ddim({1})); // auto* p_norm_ptr = dev_ctx.template Alloc(&p_norm_t); // DenseTensor trust_ratio_div_norm_t; - // trust_ratio_div_norm_t.Resize(phi::make_ddim({1})); + // trust_ratio_div_norm_t.Resize(common::make_ddim({1})); // auto* trust_ratio_div_norm_ptr = // dev_ctx.template Alloc(&trust_ratio_div_norm_t); diff --git a/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc index 045b620b9ea957..7a2e2ef96b6e07 100644 --- a/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc @@ -63,9 +63,9 @@ void CoalesceCooCPUKernel(const CPUContext& dev_ctx, out_indices.Resize({x_indices.dims()[0], out_nnz}); if (out_values.dims().size() == 1) { - out_values.Resize(phi::make_ddim({out_nnz})); + out_values.Resize(common::make_ddim({out_nnz})); } else { - out_values.Resize(phi::make_ddim({out_nnz, x_values.dims()[1]})); + out_values.Resize(common::make_ddim({out_nnz, x_values.dims()[1]})); } IntT* out_indices_ptr = out_indices.data(); diff --git a/paddle/phi/kernels/sparse/cpu/conv_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc index 15b1a54640dbc3..9f51885a94e1c0 100644 --- a/paddle/phi/kernels/sparse/cpu/conv_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc @@ -53,7 +53,7 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, int count_tmp = is2D ?
4 : 5; std::vector out_dims_vec(count_tmp, 1); - DDim out_dims = make_ddim(out_dims_vec); + DDim out_dims = common::make_ddim(out_dims_vec); std::vector kernel_sizes(kernel_dims.size()); for (int i = 0; i < kernel_dims.size(); i++) { diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc index 72e3d00962b5dc..daa29515dc77b8 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc @@ -237,14 +237,14 @@ void ElementWiseCooKernelImpl(const Context& dev_ctx, } else { DenseTensorMeta indices_meta( phi::CppTypeToDataType::Type(), - phi::make_ddim( + common::make_ddim( {static_cast(sparse_dim), static_cast(nnz)}), DataLayout::NCHW); - auto indeces_dim = - vectorize(slice_ddim(x.values().dims(), 1, x.values().dims().size())); + auto indeces_dim = common::vectorize( + slice_ddim(x.values().dims(), 1, x.values().dims().size())); indeces_dim.insert(indeces_dim.begin(), nnz); DenseTensorMeta values_meta( - x.dtype(), phi::make_ddim(indeces_dim), DataLayout::NCHW); + x.dtype(), common::make_ddim(indeces_dim), DataLayout::NCHW); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); diff --git a/paddle/phi/kernels/sparse/cpu/mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc index d4e240d5e82039..b92ebccbefbc80 100644 --- a/paddle/phi/kernels/sparse/cpu/mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/mask_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/phi/kernels/sparse/cpu/reshape_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/reshape_grad_kernel.cc index fc843f81c31ee1..8b3949badb77da 100644 --- a/paddle/phi/kernels/sparse/cpu/reshape_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/reshape_grad_kernel.cc @@ -29,7 +29,7 @@ void ReshapeCooGradKernel(const Context& dev_ctx, const SparseCooTensor& dout, SparseCooTensor* dx) { EmptyLikeCooKernel(dev_ctx, x, dx); - phi::IntArray x_shape(phi::vectorize(x.dims())); + phi::IntArray x_shape(common::vectorize(x.dims())); ReshapeCooKernel(dev_ctx, dout, x_shape, dx); } @@ -39,7 +39,7 @@ void ReshapeCsrGradKernel(const Context& dev_ctx, const SparseCsrTensor& dout, SparseCsrTensor* dx) { EmptyLikeCsrKernel(dev_ctx, x, dx); - phi::IntArray x_shape(phi::vectorize(x.dims())); + phi::IntArray x_shape(common::vectorize(x.dims())); ReshapeCsrKernel(dev_ctx, dout, x_shape, dx); } diff --git a/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc b/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc index e8badf3d6e8248..4d5d4285baceef 100644 --- a/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/sparse/unary_kernel.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" @@ -59,9 +59,9 @@ void ReshapeCooCPUKernel(const Context& dev_ctx, auto* out_indices_data = out_indices.data(); const phi::DDim& x_sparse_part_strides = - phi::stride(phi::make_ddim(x_sparse_part_dims)); + 
common::stride(common::make_ddim(x_sparse_part_dims)); const phi::DDim& out_sparse_part_strides = - phi::stride(phi::make_ddim(out_sparse_part_dims)); + common::stride(common::make_ddim(out_sparse_part_dims)); int64_t location = 0; for (int64_t j = 0; j < x_nnz; ++j) { location = 0; diff --git a/paddle/phi/kernels/sparse/cpu/slice_kernel.cc b/paddle/phi/kernels/sparse/cpu/slice_kernel.cc index c40be8a9b15799..81af8339f88a91 100644 --- a/paddle/phi/kernels/sparse/cpu/slice_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/slice_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/sparse/unary_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/slice_utils.h" diff --git a/paddle/phi/kernels/sparse/cpu/softmax_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/softmax_grad_kernel.cc index 3c55d12dcf051d..16362bcff8561c 100644 --- a/paddle/phi/kernels/sparse/cpu/softmax_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/softmax_grad_kernel.cc @@ -100,7 +100,7 @@ void SoftmaxCooGradCPUKernel(const Context& dev_ctx, auto out_values = out.values(); const auto out_dims = out.dims(); auto sparse_dim = out.sparse_dim(); - auto sizes = phi::vectorize(out_dims); + auto sizes = common::vectorize(out_dims); auto grad_indices = dout.indices(); auto grad_values = dout.values(); auto grad_nnz = dout.nnz(); @@ -136,13 +136,13 @@ void SoftmaxCooGradCPUKernel(const Context& dev_ctx, std::multiplies<>()); DenseTensor values_2(*values); - values_2.Resize(phi::make_ddim({nnz, nvalues})); + values_2.Resize(common::make_ddim({nnz, nvalues})); DenseTensor out_values_2(out_values); - out_values_2.Resize(phi::make_ddim({nnz, nvalues})); + out_values_2.Resize(common::make_ddim({nnz, nvalues})); DenseTensor grad_values_2(grad_values); - grad_values_2.Resize(phi::make_ddim({nnz, nvalues})); + grad_values_2.Resize(common::make_ddim({nnz, nvalues})); std::map> pools; phi::funcs::sparse::GetPoolsSoftmax(out_indices, sizes, dim, &pools); diff --git a/paddle/phi/kernels/sparse/cpu/softmax_kernel.cc b/paddle/phi/kernels/sparse/cpu/softmax_kernel.cc index 5ccf157a9dd40b..ea790508ab1679 100644 --- a/paddle/phi/kernels/sparse/cpu/softmax_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/softmax_kernel.cc @@ -111,7 +111,7 @@ void SoftmaxCooCPUKernel(const Context& dev_ctx, return; } - const std::vector sizes = phi::vectorize(x_dims); + const std::vector sizes = common::vectorize(x_dims); std::map> pools; IntT nvalues = std::accumulate(sizes.begin() + sparse_dim, sizes.end(), diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index f3d26568f50682..af84f14458e616 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -265,7 +265,7 @@ void CooToDenseCPUKernel(const CPUContext& dev_ctx, const auto dense_dims = x.dims(); const auto indices = x.indices(); const auto values = x.values(); - const auto indices_dims = phi::vectorize(indices.dims()); + const auto indices_dims = common::vectorize(indices.dims()); int64_t sparse_dim = indices_dims[0]; if (indices_dims.size() == 1) { sparse_dim = 1; diff --git a/paddle/phi/kernels/sparse/cpu/sum_kernel.cc b/paddle/phi/kernels/sparse/cpu/sum_kernel.cc index 2b4b11bea89e45..5b96203b7a2db3 100644 --- a/paddle/phi/kernels/sparse/cpu/sum_kernel.cc +++ 
b/paddle/phi/kernels/sparse/cpu/sum_kernel.cc @@ -44,10 +44,10 @@ void SumCooCPUKernel(const Context& dev_ctx, if (n_dim == 0) { std::vector out_indices_shape; if (keep_dim) { - out_dims = make_ddim(std::vector(x_dims.size(), 1)); + out_dims = common::make_ddim(std::vector(x_dims.size(), 1)); out_indices_shape = {sparse_dim, 1}; } else { - out_dims = make_ddim({1}); + out_dims = common::make_ddim({1}); out_indices_shape = {1}; } out_indices = Empty(dev_ctx, out_indices_shape); @@ -70,7 +70,7 @@ void SumCooCPUKernel(const Context& dev_ctx, dims.emplace_back(1); } } - out_dims = make_ddim(dims); + out_dims = common::make_ddim(dims); if (dim >= sparse_dim) { out_indices = x_indices; @@ -160,9 +160,9 @@ void SumCsrKernel(const Context& dev_ctx, DDim out_dims; if (n_dim == 0) { if (keep_dim && x.dims().size() == 3) { - out_dims = make_ddim({1, 1, 1}); + out_dims = common::make_ddim({1, 1, 1}); } else { - out_dims = make_ddim({1, 1}); + out_dims = common::make_ddim({1, 1}); } out_crows = Empty(dev_ctx, {2}); // crows = [0, 1] auto* out_crows_data = out_crows.data(); @@ -184,7 +184,7 @@ void SumCsrKernel(const Context& dev_ctx, std::vector out_data; if (x.dims().size() == 2) { out_crows_data[0] = 0; - out_dims = make_ddim({x.dims()[0], 1}); + out_dims = common::make_ddim({x.dims()[0], 1}); for (int i = 0; i < x.dims()[0]; ++i) { if (x_crows_data[i] != x_crows_data[i + 1]) { T sum_value = 0; @@ -199,9 +199,9 @@ void SumCsrKernel(const Context& dev_ctx, } } else { if (keep_dim) { - out_dims = make_ddim({x.dims()[0], x.dims()[1], 1}); + out_dims = common::make_ddim({x.dims()[0], x.dims()[1], 1}); } else { - out_dims = make_ddim({x.dims()[0], x.dims()[1]}); + out_dims = common::make_ddim({x.dims()[0], x.dims()[1]}); } int j = 0; for (int batch = 0; batch < x.dims()[0]; ++batch) { diff --git a/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu b/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu index 1a43009c519b6c..d668cd518e4c70 100644 --- a/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" @@ -35,9 +35,9 @@ void AddmmKernelImpl(const Context& dev_ctx, float alpha, DenseTensor* out) { #if CUDA_VERSION >= 11000 - std::vector input_dim = phi::vectorize(input.dims()); - std::vector x_dim = phi::vectorize(x.dims()); - std::vector y_dim = phi::vectorize(y.dims()); + std::vector input_dim = common::vectorize(input.dims()); + std::vector x_dim = common::vectorize(x.dims()); + std::vector y_dim = common::vectorize(y.dims()); auto rank = input_dim.size(); PADDLE_ENFORCE_GE( diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index aaed804c926576..67785d89505b4f 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -120,9 +120,9 @@ void CoalesceCooGPUKernel(const GPUContext& dev_ctx, out_indices.Resize({x_indices.dims()[0], out_nnz}); if (out_values.dims().size() == 1) { - out_values.Resize(phi::make_ddim({out_nnz})); + out_values.Resize(common::make_ddim({out_nnz})); } else { - out_values.Resize(phi::make_ddim({out_nnz, x_values.dims()[1]})); + out_values.Resize(common::make_ddim({out_nnz, x_values.dims()[1]})); } // 5. 
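The renames in the hunks above are purely a namespace move: every call site keeps the helper's name and arguments and only requalifies it from phi:: (or unqualified lookup) to common::. A minimal standalone sketch of the assumed post-move API, with DDim mocked as a thin struct so the snippet compiles outside the Paddle tree (the real common::DDim in paddle/common/ddim.h is a fixed-capacity dimension holder, not a vector wrapper):

#include <cstdint>
#include <utility>
#include <vector>

namespace common {
// Mock stand-in for paddle/common/ddim.h, illustration only.
struct DDim {
  std::vector<int64_t> d;
  int size() const { return static_cast<int>(d.size()); }
  int64_t operator[](int i) const { return d[i]; }
};
inline DDim make_ddim(std::vector<int64_t> dims) {
  return DDim{std::move(dims)};
}
inline std::vector<int64_t> vectorize(const DDim& ddim) { return ddim.d; }
}  // namespace common

int main() {
  // Migrated call sites spell common:: explicitly instead of phi::.
  common::DDim dims = common::make_ddim({2, 3, 4});
  std::vector<int64_t> round_trip = common::vectorize(dims);
  return round_trip.size() == 3 ? 0 : 1;
}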
scatter the values diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu index 048fef31d2fee4..8c05ae2f3ceabc 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -91,7 +91,7 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, int rank = is2D ? 4 : 5; std::vector out_dims_vec(rank, 1); - DDim out_dims = make_ddim(out_dims_vec); + DDim out_dims = common::make_ddim(out_dims_vec); std::vector kernel_sizes(kernel_dims.size()); for (int i = 0; i < kernel_dims.size(); i++) { diff --git a/paddle/phi/kernels/sparse/gpu/full_kernel.cu b/paddle/phi/kernels/sparse/gpu/full_kernel.cu index b530c3323e330a..1bad453fea8d6f 100644 --- a/paddle/phi/kernels/sparse/gpu/full_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/full_kernel.cu @@ -34,7 +34,7 @@ void FullLikeCooKernel(const Context& dev_ctx, DenseTensor* values = out->mutable_values(); phi::Full( - dev_ctx, phi::vectorize(x.values().dims()), val, values); + dev_ctx, common::vectorize(x.values().dims()), val, values); out->set_dims(x.dims()); } @@ -52,7 +52,7 @@ void FullLikeCsrKernel(const Context& dev_ctx, DenseTensor* values = out->mutable_values(); phi::Full( - dev_ctx, phi::vectorize(x.values().dims()), val, values); + dev_ctx, common::vectorize(x.values().dims()), val, values); out->set_dims(x.dims()); } diff --git a/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu index cd8013b4ee8399..5834c525e388c6 100644 --- a/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu @@ -211,7 +211,8 @@ void FusedAttentionCsrKernel( q_dim[1], batch_nnz); - softmax->set_dims(phi::make_ddim({q_dim[0], q_dim[1], q_dim[2], q_dim[2]})); + softmax->set_dims( + common::make_ddim({q_dim[0], q_dim[1], q_dim[2], q_dim[2]})); MatmulCsrDenseKernel(dev_ctx, *softmax, value, out); #else PADDLE_THROW( diff --git a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu index 3b93ff9638c052..ab367efb11fd6d 100644 --- a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu @@ -14,9 +14,9 @@ limitations under the License. 
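The mask_kernel.cu hunk that follows, like the earlier onednn and sparse ones, swaps #include "paddle/phi/core/ddim.h" for #include "paddle/common/ddim.h". The patch never shows what remains of the old phi header, so the shim below is only a guess at how such a move is usually kept source-compatible during a migration, not a quote from this PR:

// Hypothetical forwarding version of paddle/phi/core/ddim.h (assumed).
#pragma once

#include "paddle/common/ddim.h"

namespace phi {
// Re-export the relocated symbols so not-yet-migrated call sites keep
// compiling while new code includes paddle/common/ddim.h directly.
using DDim = common::DDim;
using common::make_ddim;
using common::product;
using common::slice_ddim;
using common::stride;
using common::vectorize;
}  // namespace phi

A forwarder like this lets files be converted one at a time, which would match the mix of already-migrated and still-pending call sites visible across these hunks.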
*/ #include "paddle/phi/kernels/sparse/mask_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index 7dbdbe2acc9925..5878b6662f8771 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -164,7 +164,7 @@ void MaskedMatmulCsrGradKernel(const Context& dev_ctx, // dy{Dense} = x'{Dense} * dout{SparseCsr} // That is: dy'{Dense} = dout'{SparseCsr} * x{Dense} if (dy) { - std::vector trans_dim_vec = phi::vectorize(y.dims()); + std::vector trans_dim_vec = common::vectorize(y.dims()); size_t rank = trans_dim_vec.size(); std::swap(trans_dim_vec[rank - 1], trans_dim_vec[rank - 2]); DenseTensor trans_dy = phi::Empty(dev_ctx, trans_dim_vec); diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index f39209e9b8604d..9a808f5ddcc0b9 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/meta_tensor.h" @@ -38,8 +38,8 @@ void MatmulKernelImpl(const Context& dev_ctx, const DenseTensor& y, DenseTensor* out) { #if CUDA_VERSION >= 11000 || HIP_VERSION >= 402 - std::vector xdim_vec = phi::vectorize(x.dims()); - std::vector ydim_vec = phi::vectorize(y.dims()); + std::vector xdim_vec = common::vectorize(x.dims()); + std::vector ydim_vec = common::vectorize(y.dims()); auto x_ndims = xdim_vec.size(); auto y_ndims = ydim_vec.size(); PADDLE_ENFORCE_EQ( @@ -76,7 +76,7 @@ void MatmulKernelImpl(const Context& dev_ctx, out_dim_vec[y_ndims - 2] = xdim_vec[x_ndims - 2]; out_dim_vec[y_ndims - 1] = ydim_vec[y_ndims - 1]; MetaTensor meta_out(out); - meta_out.set_dims(phi::make_ddim(out_dim_vec)); + meta_out.set_dims(common::make_ddim(out_dim_vec)); meta_out.set_dtype(y.dtype()); dev_ctx.template Alloc(out); @@ -125,9 +125,9 @@ void MaskedMatmulCsrKernel(const Context& dev_ctx, const SparseCsrTensor& mask, SparseCsrTensor* out) { #if CUDA_VERSION >= 11030 - std::vector xdim_vec = phi::vectorize(x.dims()); - std::vector ydim_vec = phi::vectorize(y.dims()); - std::vector maskdim_vec = phi::vectorize(mask.dims()); + std::vector xdim_vec = common::vectorize(x.dims()); + std::vector ydim_vec = common::vectorize(y.dims()); + std::vector maskdim_vec = common::vectorize(mask.dims()); auto x_ndims = xdim_vec.size(); auto y_ndims = ydim_vec.size(); diff --git a/paddle/phi/kernels/sparse/gpu/mv_kernel.cu b/paddle/phi/kernels/sparse/gpu/mv_kernel.cu index 27f094fb0fa982..7b442eb8e1bc6e 100644 --- a/paddle/phi/kernels/sparse/gpu/mv_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mv_kernel.cu @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" @@ -30,8 +30,8 @@ void MvKernelImpl(const Context& dev_ctx, const DenseTensor& vec, DenseTensor* out) { #if CUDA_VERSION >= 11000 - std::vector x_dim = phi::vectorize(x.dims()); - std::vector vec_dim = phi::vectorize(vec.dims()); + std::vector x_dim = common::vectorize(x.dims()); + std::vector vec_dim = common::vectorize(vec.dims()); auto x_ndims = x_dim.size(); auto vec_ndims = vec_dim.size(); PADDLE_ENFORCE_EQ(x_ndims, @@ -49,7 +49,7 @@ void MvKernelImpl(const Context& dev_ctx, "suitable for mv opetation, " "x_dim[-1] must be eaqual to vec_dim[-1].")); std::vector out_dim = {x_dim[x_ndims - 2]}; - out->Resize(phi::make_ddim(out_dim)); + out->Resize(common::make_ddim(out_dim)); dev_ctx.template Alloc(out); auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); sparse_blas.SPMV(false, static_cast(1), x, vec, static_cast(0), out); diff --git a/paddle/phi/kernels/sparse/gpu/reshape_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/reshape_grad_kernel.cu index bfc81676eb8041..a4523a82018f8d 100644 --- a/paddle/phi/kernels/sparse/gpu/reshape_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/reshape_grad_kernel.cu @@ -30,7 +30,7 @@ void ReshapeCooGradKernel(const Context& dev_ctx, const SparseCooTensor& dout, SparseCooTensor* dx) { EmptyLikeCooKernel(dev_ctx, x, dx); - phi::IntArray x_shape(phi::vectorize(x.dims())); + phi::IntArray x_shape(common::vectorize(x.dims())); ReshapeCooKernel(dev_ctx, dout, x_shape, dx); } @@ -41,7 +41,7 @@ void ReshapeCsrGradKernel(const Context& dev_ctx, const SparseCsrTensor& dout, SparseCsrTensor* dx) { EmptyLikeCsrKernel(dev_ctx, x, dx); - phi::IntArray x_shape(phi::vectorize(x.dims())); + phi::IntArray x_shape(common::vectorize(x.dims())); ReshapeCsrKernel(dev_ctx, dout, x_shape, dx); } diff --git a/paddle/phi/kernels/sparse/gpu/reshape_kernel.cu b/paddle/phi/kernels/sparse/gpu/reshape_kernel.cu index 0d04bb2477f6be..33a11639b88058 100644 --- a/paddle/phi/kernels/sparse/gpu/reshape_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/reshape_kernel.cu @@ -75,9 +75,9 @@ void ReshapeCooGPUKernel(const Context& dev_ctx, const auto* x_indices_data = x.indices().data(); auto* out_indices_data = out_indices.data(); const phi::DDim& x_sparse_part_strides = - phi::stride(phi::make_ddim(x_sparse_part_dims)); + common::stride(common::make_ddim(x_sparse_part_dims)); const phi::DDim& out_sparse_part_strides = - phi::stride(phi::make_ddim(out_sparse_part_dims)); + common::stride(common::make_ddim(out_sparse_part_dims)); int64_t *destination_x_sparse_part_strides, *destination_out_sparse_part_strides; diff --git a/paddle/phi/kernels/sparse/gpu/slice_kernel.cu b/paddle/phi/kernels/sparse/gpu/slice_kernel.cu index f47accfc8eff81..b96883c0ea3e17 100644 --- a/paddle/phi/kernels/sparse/gpu/slice_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/slice_kernel.cu @@ -17,11 +17,11 @@ #include "paddle/phi/kernels/sparse/unary_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git 
a/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu index cf3dc79c8edd0b..aeb09b3fc7c981 100644 --- a/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu @@ -189,7 +189,7 @@ void SoftmaxCooGradGPUKernel(const Context& dev_ctx, const auto output_indices_dims = out.indices().dims(); const auto out_dims = out.dims(); auto sparse_dim = out.sparse_dim(); - auto sizes = phi::vectorize(out_dims); + auto sizes = common::vectorize(out_dims); auto grad_indices = dout.indices(); auto grad_values = dout.values(); auto grad_values_ptr = grad_values.data(); @@ -243,7 +243,7 @@ void SoftmaxCooGradGPUKernel(const Context& dev_ctx, std::multiplies<>()); DenseTensor values_2(*values); - values_2.Resize(phi::make_ddim({nnz, nvalues})); + values_2.Resize(common::make_ddim({nnz, nvalues})); DenseTensor sorted_indices; DenseTensor pool_offsets; diff --git a/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu b/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu index 253a5b2141dd03..8a510c6ed30a37 100644 --- a/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu @@ -194,7 +194,7 @@ void SoftmaxCooGPUKernel(const Context& dev_ctx, auto indices = x.indices(); auto values = x.values(); const auto x_dims = x.dims(); - const std::vector sizes = phi::vectorize(x_dims); + const std::vector sizes = common::vectorize(x_dims); const auto sparse_dim = x.sparse_dim(); const IntT x_nnz = x.nnz(); DenseTensor out_indices(indices); diff --git a/paddle/phi/kernels/sparse/gpu/sum_kernel.cu b/paddle/phi/kernels/sparse/gpu/sum_kernel.cu index 594e1ec48b2e1f..4f53b8886f4936 100644 --- a/paddle/phi/kernels/sparse/gpu/sum_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sum_kernel.cu @@ -175,10 +175,10 @@ void SumCooGPU0Kernel(const Context& dev_ctx, DenseTensor out_indices; DenseTensor out_values; if (keep_dim) { - out_dims = make_ddim(std::vector(x_dims.size(), 1)); + out_dims = common::make_ddim(std::vector(x_dims.size(), 1)); out_indices = Empty(dev_ctx, {sparse_dim, 1}); } else { - out_dims = make_ddim({1}); + out_dims = common::make_ddim({1}); out_indices = Empty(dev_ctx, {1, 1}); } phi::funcs::SetConstant set_out_indices; @@ -213,7 +213,7 @@ void SumCooGPU1Kernel(const Context& dev_ctx, dims.emplace_back(1); } } - out_dims = make_ddim(dims); + out_dims = common::make_ddim(dims); if (dim >= sparse_dim) { out_indices = x_indices; @@ -308,9 +308,9 @@ void SumCsr0Kernel(const Context& dev_ctx, DenseTensor out_crows, out_cols, out_values; DDim out_dims; if (keep_dim && x.dims().size() == 3) { - out_dims = make_ddim({1, 1, 1}); + out_dims = common::make_ddim({1, 1, 1}); } else { - out_dims = make_ddim({1, 1}); + out_dims = common::make_ddim({1, 1}); } out_crows = Empty(dev_ctx, {2}); // crows = [0, 1] out_cols = Empty(dev_ctx, {1}); // crows = [0] @@ -351,7 +351,7 @@ void SumCsr1Kernel(const Context& dev_ctx, out_values = Empty(dev_ctx, {x_dim0}); auto* out_cols_data = out_cols.data(); auto* out_values_data = out_values.data(); - out_dims = make_ddim({x_dim0, 1}); + out_dims = common::make_ddim({x_dim0, 1}); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_dim0 + 1, 1); SumCsr2DCudaKernel<<(); auto* out_values_data = out_values.data(); if (keep_dim) { - out_dims = make_ddim({x_dim0, x_dim1, 1}); + out_dims = common::make_ddim({x_dim0, x_dim1, 1}); } else { - out_dims = make_ddim({x_dim0, x_dim1}); + out_dims = common::make_ddim({x_dim0, x_dim1}); } DenseTensor 
x_crows_reshape = diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 8d92b312bd857a..a072e721887b1a 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -170,7 +170,7 @@ void SparseCooTensorKernel(const Context& dev_ctx UNUSED, const DenseTensor& indices, const std::vector& shape, SparseCooTensor* out) { - *out = SparseCooTensor(indices, values, phi::make_ddim(shape)); + *out = SparseCooTensor(indices, values, common::make_ddim(shape)); } } // namespace sparse diff --git a/paddle/phi/kernels/sparse/unary_kernel.h b/paddle/phi/kernels/sparse/unary_kernel.h index 24bf4f131f6101..dff8742f5afc79 100644 --- a/paddle/phi/kernels/sparse/unary_kernel.h +++ b/paddle/phi/kernels/sparse/unary_kernel.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" diff --git a/paddle/phi/kernels/squeeze_grad_kernel.cc b/paddle/phi/kernels/squeeze_grad_kernel.cc index 75294557ace259..d39bd0c4952b4c 100644 --- a/paddle/phi/kernels/squeeze_grad_kernel.cc +++ b/paddle/phi/kernels/squeeze_grad_kernel.cc @@ -26,7 +26,7 @@ void SqueezeGradKernel(const Context& dev_ctx, const IntArray& axes UNUSED, DenseTensor* dx) { auto xshape_dims = xshape.dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); dev_ctx.template Alloc(dx); phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx); diff --git a/paddle/phi/kernels/stride/as_strided_grad_kernel.cc b/paddle/phi/kernels/stride/as_strided_grad_kernel.cc index 8cbcc5dfedc106..edf72e5da026cf 100644 --- a/paddle/phi/kernels/stride/as_strided_grad_kernel.cc +++ b/paddle/phi/kernels/stride/as_strided_grad_kernel.cc @@ -42,8 +42,8 @@ void AsStridedGradKernel(const Context& dev_ctx, phi::StridedCopyKernel( dev_ctx, out_grad, - phi::vectorize(tmp.dims()), - phi::vectorize(tmp.strides()), + common::vectorize(tmp.dims()), + common::vectorize(tmp.strides()), tmp.offset(), &tmp); })); diff --git a/paddle/phi/kernels/stride/complex_grad_kernel.cc b/paddle/phi/kernels/stride/complex_grad_kernel.cc index 91640c70a39297..800e484ea7eb88 100644 --- a/paddle/phi/kernels/stride/complex_grad_kernel.cc +++ b/paddle/phi/kernels/stride/complex_grad_kernel.cc @@ -37,8 +37,8 @@ void RealGradStridedKernel(const Context& dev_ctx, phi::StridedCopyKernel( dev_ctx, dout, - phi::vectorize(tmp.dims()), - phi::vectorize(tmp.strides()), + common::vectorize(tmp.dims()), + common::vectorize(tmp.strides()), tmp.offset(), &tmp); })); @@ -61,8 +61,8 @@ void ImagGradStridedKernel(const Context& dev_ctx, phi::StridedCopyKernel( dev_ctx, dout, - phi::vectorize(tmp.dims()), - phi::vectorize(tmp.strides()), + common::vectorize(tmp.dims()), + common::vectorize(tmp.strides()), tmp.offset(), &tmp); })); diff --git a/paddle/phi/kernels/stride/diagonal_grad_kernel.cc b/paddle/phi/kernels/stride/diagonal_grad_kernel.cc index d5ebcd6f4ab8a5..fc44c09118fad8 100644 --- a/paddle/phi/kernels/stride/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/stride/diagonal_grad_kernel.cc @@ -46,8 +46,8 @@ void DiagonalGradStridedKernel(const Context& dev_ctx, phi::StridedCopyKernel( dev_ctx, out_grad, - phi::vectorize(tmp.dims()), - phi::vectorize(tmp.strides()), + common::vectorize(tmp.dims()), + common::vectorize(tmp.strides()), 
tmp.offset(), &tmp); })); diff --git a/paddle/phi/kernels/stride/diagonal_kernel.cc b/paddle/phi/kernels/stride/diagonal_kernel.cc index 31b2aa97e96fc6..f21ea6c24ac6f9 100644 --- a/paddle/phi/kernels/stride/diagonal_kernel.cc +++ b/paddle/phi/kernels/stride/diagonal_kernel.cc @@ -54,8 +54,8 @@ void DiagonalStridedKernel(const Context& dev_ctx, } } - std::vector shape = phi::vectorize(x.dims()); - std::vector stride = phi::vectorize(x.strides()); + std::vector shape = common::vectorize(x.dims()); + std::vector stride = common::vectorize(x.strides()); shape.erase(shape.begin() + std::max(axis1, axis2)); stride.erase(stride.begin() + std::max(axis1, axis2)); shape.erase(shape.begin() + std::min(axis1, axis2)); diff --git a/paddle/phi/kernels/stride/flatten_grad_kernel.cc b/paddle/phi/kernels/stride/flatten_grad_kernel.cc index a4cfe5b3d3941e..be7ed0721fdd2f 100644 --- a/paddle/phi/kernels/stride/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/stride/flatten_grad_kernel.cc @@ -24,10 +24,10 @@ void FlattenGradStridedKernel(const Context& dev_ctx, const DenseTensor& out_grad, DenseTensor* x_grad) { auto xshape_dims = xshape.dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); ReshapeStridedKernel(dev_ctx, out_grad, - IntArray(phi::vectorize(x_dims)), + IntArray(common::vectorize(x_dims)), x_grad, nullptr); } diff --git a/paddle/phi/kernels/stride/flatten_kernel.cc b/paddle/phi/kernels/stride/flatten_kernel.cc index 3bba86123708d5..94b4ae0a89890f 100644 --- a/paddle/phi/kernels/stride/flatten_kernel.cc +++ b/paddle/phi/kernels/stride/flatten_kernel.cc @@ -25,7 +25,11 @@ void FlattenInferStridedKernel(const Context& dev_ctx, int stop_axis UNUSED, DenseTensor* out) { ReshapeStridedKernel( - dev_ctx, x, IntArray(phi::vectorize(out->dims())), out, nullptr); + dev_ctx, + x, + IntArray(common::vectorize(out->dims())), + out, + nullptr); } template diff --git a/paddle/phi/kernels/stride/index_select_grad_kernel.cc b/paddle/phi/kernels/stride/index_select_grad_kernel.cc index 15ab602fe5304d..99705b396f19ef 100644 --- a/paddle/phi/kernels/stride/index_select_grad_kernel.cc +++ b/paddle/phi/kernels/stride/index_select_grad_kernel.cc @@ -44,8 +44,8 @@ void IndexSelectGradStridedKernel(const Context& dev_ctx, phi::StridedCopyKernel( dev_ctx, out_grad, - phi::vectorize(tmp.dims()), - phi::vectorize(tmp.strides()), + common::vectorize(tmp.dims()), + common::vectorize(tmp.strides()), tmp.offset(), &tmp); })); diff --git a/paddle/phi/kernels/stride/index_select_kernel.cc b/paddle/phi/kernels/stride/index_select_kernel.cc index b7f96be147532a..ea278226ee6c2c 100644 --- a/paddle/phi/kernels/stride/index_select_kernel.cc +++ b/paddle/phi/kernels/stride/index_select_kernel.cc @@ -30,8 +30,8 @@ void IndexSelectStridedKernel(const Context& ctx, auto input_dim = x.dims(); dim = dim >= 0 ? 
dim : dim + input_dim.size(); - std::vector shape = phi::vectorize(x.dims()); - std::vector stride = phi::vectorize(x.strides()); + std::vector shape = common::vectorize(x.dims()); + std::vector stride = common::vectorize(x.strides()); int64_t offset = static_cast(x.offset()); offset = static_cast(offset + diff --git a/paddle/phi/kernels/stride/reshape_grad_kernel.cc b/paddle/phi/kernels/stride/reshape_grad_kernel.cc index 817baf7287c140..4d55c67fbcf0b0 100644 --- a/paddle/phi/kernels/stride/reshape_grad_kernel.cc +++ b/paddle/phi/kernels/stride/reshape_grad_kernel.cc @@ -26,7 +26,7 @@ void ReshapeGradStridedKernel(const Context& dev_ctx, ReshapeStridedKernel( dev_ctx, out_grad, - IntArray(phi::vectorize(x_grad->dims())), + IntArray(common::vectorize(x_grad->dims())), x_grad, nullptr); } diff --git a/paddle/phi/kernels/stride/slice_grad_kernel.cc b/paddle/phi/kernels/stride/slice_grad_kernel.cc index 32ec2c75974d3c..171c20b3b83acd 100644 --- a/paddle/phi/kernels/stride/slice_grad_kernel.cc +++ b/paddle/phi/kernels/stride/slice_grad_kernel.cc @@ -51,8 +51,8 @@ void SliceGradStridedKernel(const Context& dev_ctx, phi::StridedCopyKernel( dev_ctx, out_grad, - phi::vectorize(tmp.dims()), - phi::vectorize(tmp.strides()), + common::vectorize(tmp.dims()), + common::vectorize(tmp.strides()), tmp.offset(), &tmp); })); diff --git a/paddle/phi/kernels/stride/slice_kernel.cc b/paddle/phi/kernels/stride/slice_kernel.cc index 998bc2700df4fb..4e693ab4b0d32d 100644 --- a/paddle/phi/kernels/stride/slice_kernel.cc +++ b/paddle/phi/kernels/stride/slice_kernel.cc @@ -47,8 +47,9 @@ void SliceStridedKernel(const Context& ctx, phi::funcs::CheckAndUpdateSliceAttrs( in_dims, new_axes, &starts, &ends, nullptr, nullptr); - std::vector output_dims = phi::vectorize(input.dims()); - std::vector output_stride = phi::vectorize(input.strides()); + std::vector output_dims = common::vectorize(input.dims()); + std::vector output_stride = + common::vectorize(input.strides()); int64_t output_offset = static_cast(input.offset()); for (size_t i = 0; i < new_axes.size(); ++i) { diff --git a/paddle/phi/kernels/stride/squeeze_grad_kernel.cc b/paddle/phi/kernels/stride/squeeze_grad_kernel.cc index c472c67e651ab5..27361211e8fc02 100644 --- a/paddle/phi/kernels/stride/squeeze_grad_kernel.cc +++ b/paddle/phi/kernels/stride/squeeze_grad_kernel.cc @@ -25,9 +25,9 @@ void SqueezeGradStridedKernel(const Context& dev_ctx, const IntArray& axes UNUSED, DenseTensor* dx) { auto xshape_dims = xshape.dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); ReshapeStridedKernel( - dev_ctx, dout, IntArray(phi::vectorize(x_dims)), dx, nullptr); + dev_ctx, dout, IntArray(common::vectorize(x_dims)), dx, nullptr); } } // namespace phi diff --git a/paddle/phi/kernels/stride/squeeze_kernel.cc b/paddle/phi/kernels/stride/squeeze_kernel.cc index 33895dfcf8e66b..b03652baee624c 100644 --- a/paddle/phi/kernels/stride/squeeze_kernel.cc +++ b/paddle/phi/kernels/stride/squeeze_kernel.cc @@ -36,7 +36,7 @@ void SqueezeInferStridedKernel(const Context& dev_ctx, auto input_stride = input.strides(); if (input.Holder() == out->Holder() && input.meta() == out->meta()) { - output_dims = phi::vectorize(out->dims()); + output_dims = common::vectorize(out->dims()); if (axes.empty()) { for (int i = input_stride.size() - 1; i > 0; --i) { if (input_stride[i] != input_stride[i - 1]) { diff --git a/paddle/phi/kernels/stride/strided_slice_grad_kernel.cc 
b/paddle/phi/kernels/stride/strided_slice_grad_kernel.cc index 9b2d03a00e86eb..f0cd2d53bc8238 100644 --- a/paddle/phi/kernels/stride/strided_slice_grad_kernel.cc +++ b/paddle/phi/kernels/stride/strided_slice_grad_kernel.cc @@ -56,8 +56,8 @@ void StridedSliceRawGradStridedKernel(const Context& dev_ctx, phi::StridedCopyKernel( dev_ctx, out_grad, - phi::vectorize(tmp.dims()), - phi::vectorize(tmp.strides()), + common::vectorize(tmp.dims()), + common::vectorize(tmp.strides()), tmp.offset(), &tmp); })); diff --git a/paddle/phi/kernels/stride/strided_slice_kernel.cc b/paddle/phi/kernels/stride/strided_slice_kernel.cc index a57ed98d119a99..77919f8d000a00 100644 --- a/paddle/phi/kernels/stride/strided_slice_kernel.cc +++ b/paddle/phi/kernels/stride/strided_slice_kernel.cc @@ -35,8 +35,9 @@ void StridedSliceRawStridedKernel(const Context& dev_ctx, std::vector ends = ends_arr.GetData(); std::vector strides = strides_arr.GetData(); - std::vector output_dims = phi::vectorize(input.dims()); - std::vector output_stride = phi::vectorize(input.strides()); + std::vector output_dims = common::vectorize(input.dims()); + std::vector output_stride = + common::vectorize(input.strides()); int64_t output_offset = static_cast(input.offset()); for (size_t i = 0; i < axes.size(); ++i) { int64_t axis_size = input.dims()[axes[i]]; diff --git a/paddle/phi/kernels/stride/tensor_unfold_grad_kernel.cc b/paddle/phi/kernels/stride/tensor_unfold_grad_kernel.cc index 620d7bbb46ddc4..7dc3e6e46361ba 100644 --- a/paddle/phi/kernels/stride/tensor_unfold_grad_kernel.cc +++ b/paddle/phi/kernels/stride/tensor_unfold_grad_kernel.cc @@ -50,8 +50,8 @@ void TensorUnfoldGradKernel(const Context& dev_ctx, phi::StridedCopyKernel( dev_ctx, out_grad, - phi::vectorize(tmp.dims()), - phi::vectorize(tmp.strides()), + common::vectorize(tmp.dims()), + common::vectorize(tmp.strides()), tmp.offset(), &tmp); })); diff --git a/paddle/phi/kernels/stride/unsqueeze_grad_kernel.cc b/paddle/phi/kernels/stride/unsqueeze_grad_kernel.cc index 34a52f4659b274..c6c5c117cd94e4 100644 --- a/paddle/phi/kernels/stride/unsqueeze_grad_kernel.cc +++ b/paddle/phi/kernels/stride/unsqueeze_grad_kernel.cc @@ -24,9 +24,9 @@ void UnsqueezeGradStridedKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { auto xshape_dims = x_shape.dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); ReshapeStridedKernel( - dev_ctx, dout, IntArray(phi::vectorize(x_dims)), dx, nullptr); + dev_ctx, dout, IntArray(common::vectorize(x_dims)), dx, nullptr); } } // namespace phi diff --git a/paddle/phi/kernels/stride/unsqueeze_kernel.cc b/paddle/phi/kernels/stride/unsqueeze_kernel.cc index b97a0222e6cd62..bd1a200ea0eaae 100644 --- a/paddle/phi/kernels/stride/unsqueeze_kernel.cc +++ b/paddle/phi/kernels/stride/unsqueeze_kernel.cc @@ -28,11 +28,12 @@ void UnsqueezeInferStridedKernel(const Context& dev_ctx, const IntArray& axes_arr, DenseTensor* out) { std::vector axes = axes_arr.GetData(); - std::vector input_dims = phi::vectorize(input.dims()); - std::vector input_stride = phi::vectorize(input.strides()); + std::vector input_dims = common::vectorize(input.dims()); + std::vector input_stride = + common::vectorize(input.strides()); if (input.Holder() == out->Holder() && input.meta() == out->meta()) { - input_dims = phi::vectorize(out->dims()); + input_dims = common::vectorize(out->dims()); for (int64_t i = static_cast(axes.size() - 1); i >= 0; --i) { axes[i] = static_cast(axes[i] < 0 ? 
axes[i] + input_dims.size() : axes[i]); diff --git a/paddle/phi/kernels/stride/view_grad_kernel.cc b/paddle/phi/kernels/stride/view_grad_kernel.cc index e63598b5b58735..d04998c95622e5 100644 --- a/paddle/phi/kernels/stride/view_grad_kernel.cc +++ b/paddle/phi/kernels/stride/view_grad_kernel.cc @@ -25,7 +25,7 @@ void ViewShapeGradKernel(const Context& dev_ctx, const std::vector& dims, DenseTensor* input_grad) { ViewShapeKernel( - dev_ctx, out_grad, phi::vectorize(input.dims()), input_grad); + dev_ctx, out_grad, common::vectorize(input.dims()), input_grad); } template diff --git a/paddle/phi/kernels/strings/gpu/copy_utils.h b/paddle/phi/kernels/strings/gpu/copy_utils.h index 36cad026184242..6e413ef73098dd 100644 --- a/paddle/phi/kernels/strings/gpu/copy_utils.h +++ b/paddle/phi/kernels/strings/gpu/copy_utils.h @@ -112,7 +112,7 @@ void SerializeOnCPU(const Context& dev_ctx, for (int64_t i = 0; i < numel; ++i) { num += src_str[i].length() + 1; } - dst->Resize(phi::make_ddim({num})); + dst->Resize(common::make_ddim({num})); uint8_t* strings_data = dev_ctx.template HostAlloc(dst); auto* strings_offset = reinterpret_cast(strings_data); int start_offset = sizeof(int) * (numel + 1); @@ -137,7 +137,7 @@ void DeserializeOnCPU(const Context& dev_ctx, auto* strings_data = reinterpret_cast(src.data()); auto* strings_offset = reinterpret_cast(strings_data); int numel = strings_offset[0] / sizeof(int) - 1; - dst->Resize(phi::make_ddim({numel})); + dst->Resize(common::make_ddim({numel})); dtype::pstring* dst_str = dev_ctx.template HostAlloc(dst); for (int i = 0; i < numel; ++i) { // -1 not include '\0' @@ -156,7 +156,7 @@ void SerializeOnGPU(const phi::GPUContext& dev_ctx, auto strings_size = GetAllStringsSize(dev_ctx, src_str, numel); strings_size += sizeof(int32_t) * (numel + 1); - dst->Resize(phi::make_ddim({strings_size})); + dst->Resize(common::make_ddim({strings_size})); uint8_t* strings_data = dev_ctx.template Alloc(dst); auto* strings_offset = reinterpret_cast(strings_data); @@ -184,7 +184,7 @@ void DeserializeOnGPU(const phi::GPUContext& dev_ctx, &numel, strings_data, sizeof(numel), cudaMemcpyDeviceToHost); #endif numel = numel / sizeof(int) - 1; - dst->Resize(phi::make_ddim({numel})); + dst->Resize(common::make_ddim({numel})); dtype::pstring* dst_str = dev_ctx.template Alloc(dst); dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); diff --git a/paddle/phi/kernels/strings/strings_empty_kernel.cc b/paddle/phi/kernels/strings/strings_empty_kernel.cc index 22a43ceaff1c17..10d958f354e2d3 100644 --- a/paddle/phi/kernels/strings/strings_empty_kernel.cc +++ b/paddle/phi/kernels/strings/strings_empty_kernel.cc @@ -24,7 +24,7 @@ template void EmptyKernel(const Context& dev_ctx, const IntArray& shape, StringTensor* out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); dev_ctx.template Alloc(out); } diff --git a/paddle/phi/kernels/strings/unicode.h b/paddle/phi/kernels/strings/unicode.h index 45e41b72d086c0..410543c27d68fc 100644 --- a/paddle/phi/kernels/strings/unicode.h +++ b/paddle/phi/kernels/strings/unicode.h @@ -17,8 +17,8 @@ limitations under the License. 
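The slice_ddim/product call sites in the squeeze, unsqueeze, and argsort hunks nearby share two idioms: an XShape tensor stores a leading 0 followed by the real input dims, so the true shape is the tail slice from index 1; and reductions around an axis factor the element count into before/after products. A sketch with vector stand-ins (the real common::slice_ddim and common::product in paddle/common/ddim.h take a DDim):

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Stand-ins for common::slice_ddim and common::product, illustration only.
std::vector<int64_t> slice_ddim(const std::vector<int64_t>& d, int begin,
                                int end) {
  return {d.begin() + begin, d.begin() + end};
}
int64_t product(const std::vector<int64_t>& d) {
  return std::accumulate(d.begin(), d.end(), int64_t{1}, std::multiplies<>());
}

int main() {
  // XShape convention: leading 0, then the real input dims.
  std::vector<int64_t> xshape{0, 4, 6, 5};
  auto x_dims = slice_ddim(xshape, 1, static_cast<int>(xshape.size()));

  // Argsort-style factorization around axis 1: 4 rows before, 5 cols after.
  int axis = 1;
  int64_t len_before = product(slice_ddim(x_dims, 0, axis));
  int64_t len_after =
      product(slice_ddim(x_dims, axis + 1, static_cast<int>(x_dims.size())));
  return (len_before == 4 && len_after == 5) ? 0 : 1;
}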
*/ #include #include +#include "paddle/common/macros.h" #include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/core/macros.h" namespace phi { namespace strings { diff --git a/paddle/phi/kernels/strings/unicode_flag.h b/paddle/phi/kernels/strings/unicode_flag.h index 7e97b80c2c642a..c09104f6bfb8a8 100644 --- a/paddle/phi/kernels/strings/unicode_flag.h +++ b/paddle/phi/kernels/strings/unicode_flag.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/phi/core/macros.h" +#include "paddle/common/macros.h" namespace phi { namespace strings { diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index d001822b21fc8f..dbaf74f8c0c984 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -69,7 +69,7 @@ void TransferLayoutGeneral(const Context& dev_ctx, dst_dim[i] = src_dim[axis[i]]; } - out->Resize(phi::make_ddim(dst_dim)); + out->Resize(common::make_ddim(dst_dim)); dev_ctx.Alloc(out, x.dtype()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // In GPU fp16 model, we will insert many transfer_layout ops in diff --git a/paddle/phi/kernels/triangular_solve_grad_kernel.h b/paddle/phi/kernels/triangular_solve_grad_kernel.h index eb5a5ab461a1dc..1b51ad50d3246a 100644 --- a/paddle/phi/kernels/triangular_solve_grad_kernel.h +++ b/paddle/phi/kernels/triangular_solve_grad_kernel.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/unsqueeze_grad_kernel.cc b/paddle/phi/kernels/unsqueeze_grad_kernel.cc index a281bb66b4c671..1603b1e2f63987 100644 --- a/paddle/phi/kernels/unsqueeze_grad_kernel.cc +++ b/paddle/phi/kernels/unsqueeze_grad_kernel.cc @@ -26,7 +26,7 @@ void UnsqueezeGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { auto xshape_dims = x_shape.dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); dev_ctx.template Alloc(dx); phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), true, dx); dx->Resize(x_dims); diff --git a/paddle/phi/kernels/xpu/activation_grad_kernel.cc b/paddle/phi/kernels/xpu/activation_grad_kernel.cc index 89732be24cc918..7cada9005c33eb 100644 --- a/paddle/phi/kernels/xpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/activation_grad_kernel.cc @@ -195,7 +195,7 @@ struct XPULogGradFunctor : public funcs::BaseActivationFunctor { dev_ctx.x_context(), tmp, x->numel(), static_cast(1.0)); PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); - auto x_dims = vectorize(x->dims()); + auto x_dims = common::vectorize(x->dims()); // use [1] to replace [], because xpu not support [] if (x_dims.size() == 0) { @@ -471,9 +471,9 @@ void PowGradKernel(const Context& dev_ctx, T* x_grad = dx->data(); // check dims: all dims should equal - auto x_dims = vectorize(x.dims()); - auto dy_dims = vectorize(dout.dims()); - auto dx_dims = vectorize(dx->dims()); + auto x_dims = common::vectorize(x.dims()); + auto dy_dims = common::vectorize(dout.dims()); + auto dx_dims = common::vectorize(dx->dims()); PADDLE_ENFORCE_EQ(x_dims, dy_dims, errors::PreconditionNotMet("x_dims should match dy_dims.")); diff --git 
a/paddle/phi/kernels/xpu/activation_kernel.cc b/paddle/phi/kernels/xpu/activation_kernel.cc index 54064e90b82829..0608225281f10e 100644 --- a/paddle/phi/kernels/xpu/activation_kernel.cc +++ b/paddle/phi/kernels/xpu/activation_kernel.cc @@ -212,7 +212,7 @@ void PowKernel(const Context& dev_ctx, static_cast(&pow_factor), sizeof(T)); - auto x_dims = vectorize(x.dims()); + auto x_dims = common::vectorize(x.dims()); // use [1] to replace [], because xpu not support [] if (x_dims.size() == 0) { x_dims = std::vector({1}); diff --git a/paddle/phi/kernels/xpu/arange_kernel.cc b/paddle/phi/kernels/xpu/arange_kernel.cc index 7afdfcd60daf87..5c9ba973340b7e 100644 --- a/paddle/phi/kernels/xpu/arange_kernel.cc +++ b/paddle/phi/kernels/xpu/arange_kernel.cc @@ -32,7 +32,7 @@ void ArangeTensorKernel(const Context& dev_ctx, int64_t size = 0; phi::funcs::GetSize(start_value, end_value, step_value, &size); - out->Resize(phi::make_ddim({size})); + out->Resize(common::make_ddim({size})); auto* out_data = dev_ctx.template Alloc(out); int ret = xpu::range( diff --git a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc index b5b2ed7d328884..dda71ebe46120b 100644 --- a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/arg_min_max_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/xpu/xpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -46,14 +46,14 @@ void ArgMaxKernel(const Context& dev_ctx, DDim x_dims; int axis_val = axis.to(); if (flatten) { - x_dims = phi::make_ddim({x.numel()}); + x_dims = common::make_ddim({x.numel()}); // if flatten, the axis just as 0 axis_val = 0; } else { x_dims = x.dims(); if (axis_val < 0) axis_val += x_dims.size(); } - auto xdims_vec = phi::vectorize(x_dims); + auto xdims_vec = common::vectorize(x_dims); int r = 0; if (dtype != DataType::INT32) { dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/xpu/argsort_grad_kernel.cc b/paddle/phi/kernels/xpu/argsort_grad_kernel.cc index 4ebab7b37fc301..a96c3ade04163f 100644 --- a/paddle/phi/kernels/xpu/argsort_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/argsort_grad_kernel.cc @@ -50,9 +50,9 @@ void ArgsortGradKernel(const Context& dev_ctx, if (axis == -1 || axis + 1 == in_dims.size()) { is_need_transpose = false; } - int len_before = phi::product(phi::slice_ddim(in_dims, 0, axis)); + int len_before = common::product(common::slice_ddim(in_dims, 0, axis)); int len_after = - phi::product(phi::slice_ddim(in_dims, axis + 1, in_dims.size())); + common::product(common::slice_ddim(in_dims, axis + 1, in_dims.size())); int m = len_before * len_after; int n = in_dims[axis]; int len = m * n; diff --git a/paddle/phi/kernels/xpu/argsort_kernel.cc b/paddle/phi/kernels/xpu/argsort_kernel.cc index e1875b8f52c788..1158045a2e602b 100644 --- a/paddle/phi/kernels/xpu/argsort_kernel.cc +++ b/paddle/phi/kernels/xpu/argsort_kernel.cc @@ -186,9 +186,9 @@ void ArgsortKernel(const Context& dev_ctx, return; } - int len_before = phi::product(phi::slice_ddim(in_dims, 0, axis)); + int len_before = common::product(common::slice_ddim(in_dims, 0, axis)); int len_after = - phi::product(phi::slice_ddim(in_dims, axis + 1, in_dims.size())); + common::product(common::slice_ddim(in_dims, axis + 1, in_dims.size())); std::vector permute_vec{0, 2, 1}; std::vector 
data_shape{len_before, n, len_after}; diff --git a/paddle/phi/kernels/xpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/batch_norm_grad_kernel.cc index 863bc2759b39a3..454141ff4c3ea4 100644 --- a/paddle/phi/kernels/xpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/batch_norm_grad_kernel.cc @@ -99,7 +99,7 @@ void BatchNormGradKernel(const Context &dev_ctx, "But recevived 'data_layout' is [%s].", data_layout)); - const auto data_layout_val = phi::StringToDataLayout(data_layout); + const auto data_layout_val = common::StringToDataLayout(data_layout); use_global_stats = is_test || use_global_stats; diff --git a/paddle/phi/kernels/xpu/batch_norm_kernel.cc b/paddle/phi/kernels/xpu/batch_norm_kernel.cc index 2abb1686daed98..8427c49b43d42f 100644 --- a/paddle/phi/kernels/xpu/batch_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/batch_norm_kernel.cc @@ -43,7 +43,7 @@ void BatchNormKernel(const Context& dev_ctx, using XPUType = typename XPUTypeTrait::Type; bool test_mode = is_test && (!trainable_statistics); bool global_stats = test_mode || use_global_stats; - const auto data_layout = phi::StringToDataLayout(data_layout_str); + const auto data_layout = common::StringToDataLayout(data_layout_str); PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", true, phi::errors::InvalidArgument( diff --git a/paddle/phi/kernels/xpu/c_split_kernel.cc b/paddle/phi/kernels/xpu/c_split_kernel.cc index f330323059e2b7..1d0a6ca31f66ec 100644 --- a/paddle/phi/kernels/xpu/c_split_kernel.cc +++ b/paddle/phi/kernels/xpu/c_split_kernel.cc @@ -56,8 +56,8 @@ void CSplitKernel(const Context& dev_ctx, int64_t end_size = dims[dims_size - 1]; // remain dim - auto remain_ddim = phi::slice_ddim(dims, 0, dims_size - 1); - int64_t remain_numel = phi::product(remain_ddim); + auto remain_ddim = common::slice_ddim(dims, 0, dims_size - 1); + int64_t remain_numel = common::product(remain_ddim); dims[dims_size - 1] /= nranks; out->Resize(dims); diff --git a/paddle/phi/kernels/xpu/compare_kernel.cc b/paddle/phi/kernels/xpu/compare_kernel.cc index 51b392b7144ae2..2732823fd94282 100644 --- a/paddle/phi/kernels/xpu/compare_kernel.cc +++ b/paddle/phi/kernels/xpu/compare_kernel.cc @@ -33,8 +33,8 @@ void XPUCompareKernelImpl(const Context& dev_ctx, bool*, const std::vector&, const std::vector&)> func) { - auto x_shape = vectorize(x.dims()); - auto y_shape = vectorize(y.dims()); + auto x_shape = common::vectorize(x.dims()); + auto y_shape = common::vectorize(y.dims()); if (x.dims().size() == 0) { x_shape = std::vector({1}); diff --git a/paddle/phi/kernels/xpu/contiguous_kernel.cc b/paddle/phi/kernels/xpu/contiguous_kernel.cc index 8e3c0a95954527..922bda579cbdbe 100644 --- a/paddle/phi/kernels/xpu/contiguous_kernel.cc +++ b/paddle/phi/kernels/xpu/contiguous_kernel.cc @@ -39,8 +39,8 @@ void ContiguousKernel(const Context& dev_ctx, r = xpu::as_strided(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(input.strides()), + common::vectorize(input.dims()), + common::vectorize(input.strides()), 0); } } else if (std::is_same::value) { @@ -53,8 +53,8 @@ void ContiguousKernel(const Context& dev_ctx, r = xpu::as_strided(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(input.strides()), + common::vectorize(input.dims()), + common::vectorize(input.strides()), 0); } } else if (std::is_same::value) { @@ -66,12 +66,13 @@ void ContiguousKernel(const Context& dev_ctx, r = xpu::copy( dev_ctx.x_context(), input_data, output_data, 1); } else { - r = 
xpu::as_strided(dev_ctx.x_context(), - input_data, - output_data, - phi::vectorize(input.dims()), - phi::vectorize(input.strides()), - 0); + r = xpu::as_strided( + dev_ctx.x_context(), + input_data, + output_data, + common::vectorize(input.dims()), + common::vectorize(input.strides()), + 0); } } else if (std::is_same::value) { using XPUFLOAT16 = typename XPUTypeTrait::Type; @@ -82,12 +83,13 @@ void ContiguousKernel(const Context& dev_ctx, r = xpu::copy( dev_ctx.x_context(), input_data, output_data, 1); } else { - r = xpu::as_strided(dev_ctx.x_context(), - input_data, - output_data, - phi::vectorize(input.dims()), - phi::vectorize(input.strides()), - 0); + r = xpu::as_strided( + dev_ctx.x_context(), + input_data, + output_data, + common::vectorize(input.dims()), + common::vectorize(input.strides()), + 0); } } else if (std::is_same::value) { using XPUFLOAT16 = typename XPUTypeTrait::Type; @@ -98,12 +100,13 @@ void ContiguousKernel(const Context& dev_ctx, r = xpu::copy( dev_ctx.x_context(), input_data, output_data, 1); } else { - r = xpu::as_strided(dev_ctx.x_context(), - input_data, - output_data, - phi::vectorize(input.dims()), - phi::vectorize(input.strides()), - 0); + r = xpu::as_strided( + dev_ctx.x_context(), + input_data, + output_data, + common::vectorize(input.dims()), + common::vectorize(input.strides()), + 0); } } else if (std::is_same::value) { auto input_data = reinterpret_cast(input.data()); @@ -115,8 +118,8 @@ void ContiguousKernel(const Context& dev_ctx, r = xpu::as_strided(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(input.strides()), + common::vectorize(input.dims()), + common::vectorize(input.strides()), 0); } } else if (std::is_same::value) { @@ -129,8 +132,8 @@ void ContiguousKernel(const Context& dev_ctx, r = xpu::as_strided(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(input.strides()), + common::vectorize(input.dims()), + common::vectorize(input.strides()), 0); } } else if (std::is_same::value) { @@ -143,8 +146,8 @@ void ContiguousKernel(const Context& dev_ctx, r = xpu::as_strided(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(input.strides()), + common::vectorize(input.dims()), + common::vectorize(input.strides()), 0); } } else if (std::is_same::value) { @@ -157,8 +160,8 @@ void ContiguousKernel(const Context& dev_ctx, r = xpu::as_strided(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(input.strides()), + common::vectorize(input.dims()), + common::vectorize(input.strides()), 0); } } else if (std::is_same::value) { @@ -170,8 +173,8 @@ void ContiguousKernel(const Context& dev_ctx, r = xpu::as_strided(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(input.strides()), + common::vectorize(input.dims()), + common::vectorize(input.strides()), 0); } } else { diff --git a/paddle/phi/kernels/xpu/conv_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_grad_kernel.cc index 0c40e09d2202f4..03276ebd53b5f1 100644 --- a/paddle/phi/kernels/xpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_grad_kernel.cc @@ -48,11 +48,11 @@ void ConvGradKernel(const Context& dev_ctx, ("XPU doesn't support data_format is NDHWC in conv grad op."))); phi::DDim in_data_dims = - phi::slice_ddim(input.dims(), 2, input.dims().size()); + common::slice_ddim(input.dims(), 2, input.dims().size()); phi::DDim filter_data_dims = - phi::slice_ddim(filter.dims(), 2, 
filter.dims().size()); - std::vector ksize = phi::vectorize(filter_data_dims); - std::vector filter_shape = phi::vectorize(filter.dims()); + common::slice_ddim(filter.dims(), 2, filter.dims().size()); + std::vector ksize = common::vectorize(filter_data_dims); + std::vector filter_shape = common::vectorize(filter.dims()); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); @@ -269,11 +269,11 @@ void Conv3DGradKernel(const Context& dev_ctx, if (!input_grad && !filter_grad) return; phi::DDim in_data_dims = - phi::slice_ddim(input.dims(), 2, input.dims().size()); + common::slice_ddim(input.dims(), 2, input.dims().size()); phi::DDim filter_data_dims = - phi::slice_ddim(filter.dims(), 2, filter.dims().size()); - std::vector ksize = phi::vectorize(filter_data_dims); - std::vector filter_shape = phi::vectorize(filter.dims()); + common::slice_ddim(filter.dims(), 2, filter.dims().size()); + std::vector ksize = common::vectorize(filter_data_dims); + std::vector filter_shape = common::vectorize(filter.dims()); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); diff --git a/paddle/phi/kernels/xpu/conv_kernel.cc b/paddle/phi/kernels/xpu/conv_kernel.cc index 7a699225f3b01b..0dc93d676186bf 100644 --- a/paddle/phi/kernels/xpu/conv_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_kernel.cc @@ -47,10 +47,10 @@ void ConvKernel(const Context& dev_ctx, ("XPU does not support data_format is NDHWC in conv op."))); phi::DDim in_data_dims = - phi::slice_ddim(input.dims(), 2, input.dims().size()); + common::slice_ddim(input.dims(), 2, input.dims().size()); phi::DDim filter_data_dims = - phi::slice_ddim(filter.dims(), 2, filter.dims().size()); - std::vector ksize = phi::vectorize(filter_data_dims); + common::slice_ddim(filter.dims(), 2, filter.dims().size()); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); @@ -78,7 +78,7 @@ void ConvKernel(const Context& dev_ctx, if (data_format == "NHWC") { filter_data_tmp = RAII_GUARD.alloc(filter.numel()); PADDLE_ENFORCE_XDNN_NOT_NULL(filter_data_tmp); - std::vector filter_shape = phi::vectorize(filter.dims()); + std::vector filter_shape = common::vectorize(filter.dims()); int r = xpu::transpose(dev_ctx.x_context(), filter_data, filter_data_tmp, @@ -215,10 +215,10 @@ void Conv3DKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); phi::DDim in_data_dims = - phi::slice_ddim(input.dims(), 2, input.dims().size()); + common::slice_ddim(input.dims(), 2, input.dims().size()); phi::DDim filter_data_dims = - phi::slice_ddim(filter.dims(), 2, filter.dims().size()); - std::vector ksize = phi::vectorize(filter_data_dims); + common::slice_ddim(filter.dims(), 2, filter.dims().size()); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); @@ -248,7 +248,7 @@ void Conv3DKernel(const Context& dev_ctx, if (data_format == "NDHWC") { filter_data_tmp = RAII_GUARD.alloc(filter.numel()); PADDLE_ENFORCE_XDNN_NOT_NULL(filter_data_tmp); - std::vector filter_shape = phi::vectorize(filter.dims()); + std::vector filter_shape = common::vectorize(filter.dims()); int r = xpu::transpose(dev_ctx.x_context(), filter_data, filter_data_tmp, diff --git a/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc index f6090980745bf8..296e02c28016d1 100644 --- 
a/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc @@ -52,7 +52,7 @@ void Conv2dTransposeGradKernel(const Context& ctx, DDim in_data_dims = slice_ddim(x.dims(), 2, x.dims().size()); DDim filter_data_dims = slice_ddim(filter_.dims(), 2, filter_.dims().size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); UpdatePaddingAndDilation( &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); diff --git a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc index ae8d71a68300f8..2a1195e48c1f00 100644 --- a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc @@ -63,7 +63,7 @@ void Conv2dTransposeKernel(const Context& ctx, DDim in_data_dims = slice_ddim(x.dims(), 2, x.dims().size()); DDim filter_data_dims = slice_ddim(filter.dims(), 2, filter.dims().size()); - std::vector ksize = vectorize(filter_data_dims); + std::vector ksize = common::vectorize(filter_data_dims); std::vector paddings_ = paddings; std::vector dilations_ = dilations; diff --git a/paddle/phi/kernels/xpu/cross_entropy_kernel.cc b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc index b678fde9a882b8..4a3e68169c6b2b 100644 --- a/paddle/phi/kernels/xpu/cross_entropy_kernel.cc +++ b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc @@ -43,7 +43,7 @@ void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, dev_ctx.template Alloc(loss); const int n = phi::funcs::SizeToAxis(axis, logits.dims()); const int d = phi::funcs::SizeFromAxis(axis, logits.dims()); - std::vector logits_dims = phi::vectorize(logits.dims()); + std::vector logits_dims = common::vectorize(logits.dims()); int t = logits_dims[axis]; diff --git a/paddle/phi/kernels/xpu/cum_kernel.cc b/paddle/phi/kernels/xpu/cum_kernel.cc index cadacf102a8576..64750f9fc54709 100644 --- a/paddle/phi/kernels/xpu/cum_kernel.cc +++ b/paddle/phi/kernels/xpu/cum_kernel.cc @@ -40,7 +40,7 @@ void CumsumKernel(const Context& dev_ctx, } // prepare for call xdnn api - std::vector x_shape = phi::vectorize(x.dims()); + std::vector x_shape = common::vectorize(x.dims()); int axis_as_int = axis.to(); if (flatten) { diff --git a/paddle/phi/kernels/xpu/cumprod_kernel.cc b/paddle/phi/kernels/xpu/cumprod_kernel.cc index c9b771c7bd3ef4..da9cdf4cfa6acf 100644 --- a/paddle/phi/kernels/xpu/cumprod_kernel.cc +++ b/paddle/phi/kernels/xpu/cumprod_kernel.cc @@ -29,7 +29,7 @@ void CumprodKernel(const Context& dev_ctx, auto* x_data = x->data(); auto* out_data = dev_ctx.template Alloc(out); DDim shape = x->dims(); - std::vector xshape = phi::vectorize(shape); + std::vector xshape = common::vectorize(shape); if (dim < 0) dim += xshape.size(); if (shape.size() == 0) { diff --git a/paddle/phi/kernels/xpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/xpu/deformable_conv_grad_kernel.cc index 9b975698e9a99c..45b1d33a9f7ffd 100644 --- a/paddle/phi/kernels/xpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/deformable_conv_grad_kernel.cc @@ -75,7 +75,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, "in deformable_conv_grad op.")); const int batch_size = static_cast(x.dims()[0]); - std::vector output_shape_vec(phi::vectorize(out_grad.dims())); + std::vector output_shape_vec(common::vectorize(out_grad.dims())); const T* output_grad_ptr = out_grad.data(); const T* input_ptr = x.data(); const T* filter_ptr = filter.data(); diff --git 
a/paddle/phi/kernels/xpu/deformable_conv_kernel.cc b/paddle/phi/kernels/xpu/deformable_conv_kernel.cc index 895af9486024da..29c5d6896f3ed1 100644 --- a/paddle/phi/kernels/xpu/deformable_conv_kernel.cc +++ b/paddle/phi/kernels/xpu/deformable_conv_kernel.cc @@ -54,7 +54,7 @@ void DeformableConvKernel(const Context& dev_ctx, "in deformable_conv op.")); const int batch_size = static_cast(x.dims()[0]); - std::vector output_shape_vec(phi::vectorize(out->dims())); + std::vector output_shape_vec(common::vectorize(out->dims())); const T* input_ptr = x.data(); const T* filter_ptr = filter.data(); diff --git a/paddle/phi/kernels/xpu/diag_kernel.cc b/paddle/phi/kernels/xpu/diag_kernel.cc index fe7495f471d09f..89c991742e83cc 100644 --- a/paddle/phi/kernels/xpu/diag_kernel.cc +++ b/paddle/phi/kernels/xpu/diag_kernel.cc @@ -31,8 +31,8 @@ void DiagKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); auto* out_data = reinterpret_cast(out->data()); - auto x_shape = vectorize(x.dims()); - auto out_shape = vectorize(out->dims()); + auto x_shape = common::vectorize(x.dims()); + auto out_shape = common::vectorize(out->dims()); if (x.dims().size() == 0) { x_shape = std::vector({1}); diff --git a/paddle/phi/kernels/xpu/diagonal_kernel.cc b/paddle/phi/kernels/xpu/diagonal_kernel.cc index 708e4d0bd8c88d..eabed011deb5c3 100644 --- a/paddle/phi/kernels/xpu/diagonal_kernel.cc +++ b/paddle/phi/kernels/xpu/diagonal_kernel.cc @@ -28,8 +28,8 @@ void DiagonalKernel(const Context& dev_ctx, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; T* out_data = dev_ctx.template Alloc(out); - std::vector xshape = phi::vectorize(x.dims()); - std::vector yshape = phi::vectorize(out->dims()); + std::vector xshape = common::vectorize(x.dims()); + std::vector yshape = common::vectorize(out->dims()); int r = xpu::diagonal(dev_ctx.x_context(), reinterpret_cast(x.data()), diff --git a/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc b/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc index 4bb12980ec9e31..14a8ad6d34634b 100644 --- a/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc @@ -61,7 +61,7 @@ void AddGradKernel(const Context& dev_ctx, } std::vector reduce_dims = funcs::GetReduceDim(dx->dims(), dz_dims, axis); - std::vector dz_vector = phi::vectorize(dz_dims); + std::vector dz_vector = common::vectorize(dz_dims); int ret = xpu::reduce_sum(dev_ctx.x_context(), @@ -86,7 +86,7 @@ void AddGradKernel(const Context& dev_ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(dy->dims(), dz_dims, axis); - std::vector dz_vector = phi::vectorize(dz_dims); + std::vector dz_vector = common::vectorize(dz_dims); int ret = xpu::reduce_sum(dev_ctx.x_context(), reinterpret_cast(dz_data), diff --git a/paddle/phi/kernels/xpu/elementwise_add_kernel.cc b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc index ad6796f81c5c45..569d967c1379b0 100644 --- a/paddle/phi/kernels/xpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc @@ -55,8 +55,8 @@ void GradAddXPUKernel(const Context& dev_ctx, using XPUType = typename XPUTypeTrait::Type; dev_ctx.template Alloc(out); - auto x_shape = phi::vectorize(x.dims()); - auto y_shape = phi::vectorize(y.dims()); + auto x_shape = common::vectorize(x.dims()); + auto y_shape = common::vectorize(y.dims()); int r = xpu::broadcast_add(dev_ctx.x_context(), reinterpret_cast(x.data()), reinterpret_cast(y.data()), diff --git a/paddle/phi/kernels/xpu/expand_as_kernel.cc 
b/paddle/phi/kernels/xpu/expand_as_kernel.cc
index ae72b85bf06b15..0701294217f412 100644
--- a/paddle/phi/kernels/xpu/expand_as_kernel.cc
+++ b/paddle/phi/kernels/xpu/expand_as_kernel.cc
@@ -28,7 +28,7 @@ void ExpandAs(const Context& context,
               DenseTensor* out) {
   using XPUType = typename XPUTypeTrait::Type;
   auto in_dims = x.dims();
-  auto vec_in_dims = phi::vectorize(in_dims);
+  auto vec_in_dims = common::vectorize(in_dims);
   auto diff = target_shape.size() - vec_in_dims.size();
   vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
   for (size_t i = 0; i < vec_in_dims.size(); ++i) {
@@ -49,7 +49,7 @@ void ExpandAs(const Context& context,
     }
   }
   if (target_shape.size() == 0) {
-    phi::DDim out_dims = phi::make_ddim(target_shape);
+    phi::DDim out_dims = common::make_ddim(target_shape);
     out->Resize(out_dims);
     context.template Alloc(out);
@@ -61,11 +61,11 @@ void ExpandAs(const Context& context,
     return;
   }
-  phi::DDim out_dims = phi::make_ddim(target_shape);
+  phi::DDim out_dims = common::make_ddim(target_shape);
   out->Resize(out_dims);
   context.template Alloc(out);
   auto& x_shape = vec_in_dims;
-  auto out_shape = phi::vectorize(out_dims);
+  auto out_shape = common::vectorize(out_dims);
   int r = XPU_SUCCESS;
diff --git a/paddle/phi/kernels/xpu/expand_grad_kernel.cc b/paddle/phi/kernels/xpu/expand_grad_kernel.cc
index 1665b8e31926c8..a346b07064fb7b 100644
--- a/paddle/phi/kernels/xpu/expand_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/expand_grad_kernel.cc
@@ -30,8 +30,8 @@ void ExpandGradKernel(const Context& ctx,
                       DenseTensor* in_grad) {
   using XPUType = typename XPUTypeTrait::Type;
   auto in_grad_data = ctx.template Alloc(in_grad);
-  auto out_grad_dims = phi::vectorize(out_grad.dims());
-  auto in_grad_dims = phi::vectorize(in_grad->dims());
+  auto out_grad_dims = common::vectorize(out_grad.dims());
+  auto in_grad_dims = common::vectorize(in_grad->dims());
   in_grad_dims.insert(
       in_grad_dims.begin(), out_grad.dims().size() - in_grad->dims().size(), 1);
diff --git a/paddle/phi/kernels/xpu/expand_kernel.cc b/paddle/phi/kernels/xpu/expand_kernel.cc
index d8808d3c3aae3a..f1bac016a17f14 100644
--- a/paddle/phi/kernels/xpu/expand_kernel.cc
+++ b/paddle/phi/kernels/xpu/expand_kernel.cc
@@ -27,7 +27,7 @@ void ExpandKernel(const Context& ctx,
   using XPUType = typename XPUTypeTrait::Type;
   auto in_dims = x.dims();
   auto expand_shape = shape.GetData();
-  auto vec_in_dims = phi::vectorize(in_dims);
+  auto vec_in_dims = common::vectorize(in_dims);
   auto diff = expand_shape.size() - vec_in_dims.size();
   vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
   std::vector final_expand_shape(vec_in_dims.size());
@@ -94,11 +94,11 @@ void ExpandKernel(const Context& ctx,
                         shape_size,
                         rank));
-  DDim out_dims = phi::make_ddim(final_expand_shape);
+  DDim out_dims = common::make_ddim(final_expand_shape);
   out->Resize(out_dims);
   ctx.template Alloc(out);
   auto& x_shape = vec_in_dims;
-  auto out_shape = phi::vectorize(out_dims);
+  auto out_shape = common::vectorize(out_dims);
   if (shape_size == 0) {
     x_shape = {1};
     out_shape = {1};
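(Aside: all three Expand kernels above share one shape idiom — vectorize the DDim, then left-pad the vector with 1s until its rank matches the broadcast target. A minimal standalone sketch of that step; AlignRank is a hypothetical helper name, not code from this patch.)

// Sketch of the rank-alignment step used by ExpandAs/ExpandKernel, where
// common::vectorize plays the role of the DDim-to-vector conversion.
#include <cstdint>
#include <vector>

std::vector<int64_t> AlignRank(std::vector<int64_t> vec_in_dims,
                               size_t target_rank) {
  // Left-pad with 1s: {32, 32} aligned to rank 4 -> {1, 1, 32, 32}.
  size_t diff = target_rank - vec_in_dims.size();
  vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
  return vec_in_dims;
}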
diff --git a/paddle/phi/kernels/xpu/fill_diagonal_tensor_kernel.cc b/paddle/phi/kernels/xpu/fill_diagonal_tensor_kernel.cc
index 216205f251046f..9d7f435d583a06 100644
--- a/paddle/phi/kernels/xpu/fill_diagonal_tensor_kernel.cc
+++ b/paddle/phi/kernels/xpu/fill_diagonal_tensor_kernel.cc
@@ -35,8 +35,8 @@ void FillDiagonalTensorKernel(const Context &ctx,
                     x.numel());
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
-  std::vector xshape = phi::vectorize(x.dims());
-  std::vector yshape = phi::vectorize(y.dims());
+  std::vector xshape = common::vectorize(x.dims());
+  std::vector yshape = common::vectorize(y.dims());
   r = xpu::fill_diagonal_tensor(ctx.x_context(),
                                 reinterpret_cast(x.data()),
diff --git a/paddle/phi/kernels/xpu/flip_kernel.cc b/paddle/phi/kernels/xpu/flip_kernel.cc
index 3311fce88bc1d0..56a31197e56c79 100644
--- a/paddle/phi/kernels/xpu/flip_kernel.cc
+++ b/paddle/phi/kernels/xpu/flip_kernel.cc
@@ -40,7 +40,7 @@ void FlipKernel(const Context& dev_ctx,
     phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
     return;
   }
-  std::vector x_shape = phi::vectorize(x.dims());
+  std::vector x_shape = common::vectorize(x.dims());
   auto x_data = reinterpret_cast(x.data());
   auto out_data = reinterpret_cast(out->data());
   auto numel = x.numel();
diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc
index abbd28f74db709..1a780f132016d0 100644
--- a/paddle/phi/kernels/xpu/full_kernel.cc
+++ b/paddle/phi/kernels/xpu/full_kernel.cc
@@ -34,7 +34,7 @@ void FullKernel(const Context& dev_ctx,
                 DataType dtype,
                 DenseTensor* out) {
   using XPUInTDType = typename XPUTypeTrait::Type;
-  out->Resize(phi::make_ddim(shape.GetData()));
+  out->Resize(common::make_ddim(shape.GetData()));
   int numel = out->numel();
   dev_ctx.template Alloc(out);
   auto out_data = reinterpret_cast(out->data());
@@ -109,7 +109,7 @@ void FullBatchSizeLikeKernel(const Context& dev_ctx,
     // set the correct batch size for the LoDTensor.
     auto odims = out->dims();
     odims[out_batch_size_dim] = static_cast(x.lod().back().size()) - 1;
-    FullKernel(dev_ctx, phi::vectorize(odims), val, dtype, out);
+    FullKernel(dev_ctx, common::vectorize(odims), val, dtype, out);
   }
   FullLikeKernel(dev_ctx, x, val, dtype, out);
 }
diff --git a/paddle/phi/kernels/xpu/gather_nd_grad_kernel.cc b/paddle/phi/kernels/xpu/gather_nd_grad_kernel.cc
index d260e97ce30204..bab6e86ec1cde3 100644
--- a/paddle/phi/kernels/xpu/gather_nd_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/gather_nd_grad_kernel.cc
@@ -48,8 +48,8 @@ void GatherNdGradKernel(const Context &ctx,
       0,
       phi::errors::InvalidArgument("end_size[%d] should be 0", end_size));
   // remain dim
-  auto remain_ddim = phi::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = phi::product(remain_ddim);
+  auto remain_ddim = common::slice_ddim(index_dims, 0, index_dims_size - 1);
+  int64_t remain_numel = common::product(remain_ddim);
   int64_t x_numel = x.numel();
   int64_t out_grad_numel = out_grad.numel();
@@ -85,8 +85,8 @@ void GatherNdGradKernel(const Context &ctx,
                                        phi::DataType::INT32,
                                        phi::DataType::INT64));
-  auto x_shape = phi::vectorize(x_grad->dims());
-  auto index_shape = phi::vectorize(index.dims());
+  auto x_shape = common::vectorize(x_grad->dims());
+  auto index_shape = common::vectorize(index.dims());
   if (index_shape.size() == 1) {
     index_shape.insert(index_shape.begin(), 1);
   }
diff --git a/paddle/phi/kernels/xpu/gather_nd_kernel.cc b/paddle/phi/kernels/xpu/gather_nd_kernel.cc
index d7250678ffdc42..8d9ca774088c6a 100644
--- a/paddle/phi/kernels/xpu/gather_nd_kernel.cc
+++ b/paddle/phi/kernels/xpu/gather_nd_kernel.cc
@@ -41,8 +41,8 @@ void GatherNdKernel(const Context &ctx,
       0,
       phi::errors::InvalidArgument("end_size[%d] should be 0", end_size));
   // remain dim
-  auto remain_ddim = phi::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = phi::product(remain_ddim);
+  auto remain_ddim = common::slice_ddim(index_dims, 0, index_dims_size - 1);
+  int64_t remain_numel = common::product(remain_ddim);
   int64_t x_numel = x.numel();
   int64_t y_numel = out->numel();
@@ -78,8 +78,8 @@ void GatherNdKernel(const Context &ctx,
                                           DataType::INT32,
                                           DataType::INT64));
-  auto x_shape = phi::vectorize(x.dims());
-  auto index_shape = phi::vectorize(index.dims());
+  auto x_shape = common::vectorize(x.dims());
+  auto index_shape = common::vectorize(index.dims());
   if (index_shape.size() == 1) {
     index_shape.insert(index_shape.begin(), 1);
   }
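(Aside: both GatherNd kernels treat the index tensor's last axis as coordinates and every axis before it as the count of gathered slices, hence remain_numel = product(slice_ddim(index_dims, 0, rank - 1)). A toy sketch of that computation under the same assumption; RemainNumel is a hypothetical helper, not code from this patch.)

// Toy model of remain_numel; assumes the index tensor has rank >= 1.
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

int64_t RemainNumel(const std::vector<int64_t>& index_dims) {
  // Drop the last axis and multiply the rest:
  // {5, 7, 2} -> remain dims {5, 7} -> 35 gathered slices.
  return std::accumulate(index_dims.begin(), index_dims.end() - 1,
                         static_cast<int64_t>(1), std::multiplies<int64_t>());
}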
diff --git a/paddle/phi/kernels/xpu/gaussian_kernel.cc b/paddle/phi/kernels/xpu/gaussian_kernel.cc
index 2c4a29b6bfe515..99bde3096f6c11 100644
--- a/paddle/phi/kernels/xpu/gaussian_kernel.cc
+++ b/paddle/phi/kernels/xpu/gaussian_kernel.cc
@@ -28,7 +28,7 @@ void GaussianKernel(const Context& ctx,
                     int seed,
                     DataType dtype,
                     DenseTensor* out) {
-  out->Resize(phi::make_ddim(shape.GetData()));
+  out->Resize(common::make_ddim(shape.GetData()));
   T* data = ctx.template Alloc(out);
   using XPUType = typename XPUTypeTrait::Type;
   int64_t real_seed = seed != 0 ? seed : ctx.GetGenerator()->Random64();
diff --git a/paddle/phi/kernels/xpu/generate_proposals_kernel.cc b/paddle/phi/kernels/xpu/generate_proposals_kernel.cc
index 367ebfde95ae37..1b05c6e55c0016 100644
--- a/paddle/phi/kernels/xpu/generate_proposals_kernel.cc
+++ b/paddle/phi/kernels/xpu/generate_proposals_kernel.cc
@@ -88,16 +88,16 @@ std::pair ProposalForOneImage(
   SortDescending(dev_ctx, scores_slice, &index_sort, pre_nms_top_n);
   DenseTensor scores_sel, bbox_sel, anchor_sel, var_sel;
-  scores_sel.Resize(phi::make_ddim({index_sort.numel(), 1}));
+  scores_sel.Resize(common::make_ddim({index_sort.numel(), 1}));
   dev_ctx.template Alloc(&scores_sel);
-  bbox_sel.Resize(phi::make_ddim({index_sort.numel(), 4}));
+  bbox_sel.Resize(common::make_ddim({index_sort.numel(), 4}));
   dev_ctx.template Alloc(&bbox_sel);
-  anchor_sel.Resize(phi::make_ddim({index_sort.numel(), 4}));
+  anchor_sel.Resize(common::make_ddim({index_sort.numel(), 4}));
   dev_ctx.template Alloc(&anchor_sel);
-  var_sel.Resize(phi::make_ddim({index_sort.numel(), 4}));
+  var_sel.Resize(common::make_ddim({index_sort.numel(), 4}));
   dev_ctx.template Alloc(&var_sel);
   int r = xpu::gather(dev_ctx.x_context(),
@@ -145,7 +145,7 @@ std::pair ProposalForOneImage(
   // 2. box decode and clipping
   DenseTensor proposals;
-  proposals.Resize(phi::make_ddim({index_sort.numel(), 4}));
+  proposals.Resize(common::make_ddim({index_sort.numel(), 4}));
   dev_ctx.template Alloc(&proposals);
   r = xpu::box_decoder(dev_ctx.x_context(),
@@ -161,10 +161,10 @@ std::pair ProposalForOneImage(
   // 3.
filter DenseTensor keep_index, keep_num_t; - keep_index.Resize(phi::make_ddim({pre_nms_num})); + keep_index.Resize(common::make_ddim({pre_nms_num})); dev_ctx.template Alloc(&keep_index); - keep_num_t.Resize(phi::make_ddim({1})); + keep_num_t.Resize(common::make_ddim({1})); dev_ctx.template Alloc(&keep_num_t); min_size = std::max(min_size, 1.0f); r = xpu::remove_small_boxes(dev_ctx.x_context(), @@ -191,17 +191,17 @@ std::pair ProposalForOneImage( // Handle the case when there is no keep index left if (keep_num == 0) { phi::funcs::SetConstant set_zero; - proposals_filter.Resize(phi::make_ddim({1, 4})); + proposals_filter.Resize(common::make_ddim({1, 4})); dev_ctx.template Alloc(&proposals_filter); - scores_filter.Resize(phi::make_ddim({1, 1})); + scores_filter.Resize(common::make_ddim({1, 1})); dev_ctx.template Alloc(&scores_filter); set_zero(dev_ctx, &proposals_filter, static_cast(0)); set_zero(dev_ctx, &scores_filter, static_cast(0)); return std::make_pair(proposals_filter, scores_filter); } - proposals_filter.Resize(phi::make_ddim({keep_num, 4})); + proposals_filter.Resize(common::make_ddim({keep_num, 4})); dev_ctx.template Alloc(&proposals_filter); - scores_filter.Resize(phi::make_ddim({keep_num, 1})); + scores_filter.Resize(common::make_ddim({keep_num, 1})); dev_ctx.template Alloc(&scores_filter); r = xpu::gather(dev_ctx.x_context(), proposals.data(), @@ -245,9 +245,9 @@ std::pair ProposalForOneImage( } DenseTensor scores_nms, proposals_nms; - proposals_nms.Resize(phi::make_ddim({keep_index.numel(), 4})); + proposals_nms.Resize(common::make_ddim({keep_index.numel(), 4})); dev_ctx.template Alloc(&proposals_nms); - scores_nms.Resize(phi::make_ddim({keep_index.numel(), 1})); + scores_nms.Resize(common::make_ddim({keep_index.numel(), 1})); dev_ctx.template Alloc(&scores_nms); r = xpu::gather(dev_ctx.x_context(), proposals_filter.data(), @@ -307,10 +307,10 @@ void GenerateProposalsKernel(const Context& dev_ctx, int w_bbox = bbox_dim[3]; DenseTensor bbox_deltas_swap, scores_swap; - bbox_deltas_swap.Resize(phi::make_ddim({num, h_bbox, w_bbox, c_bbox})); + bbox_deltas_swap.Resize(common::make_ddim({num, h_bbox, w_bbox, c_bbox})); dev_ctx.template Alloc(&bbox_deltas_swap); - scores_swap.Resize(phi::make_ddim({num, h_score, w_score, c_score})); + scores_swap.Resize(common::make_ddim({num, h_score, w_score, c_score})); dev_ctx.template Alloc(&scores_swap); std::vector axis = {0, 2, 3, 1}; @@ -330,14 +330,14 @@ void GenerateProposalsKernel(const Context& dev_ctx, DenseTensor tmp_anchors = anchors; DenseTensor tmp_variances = variances; - tmp_anchors.Resize(phi::make_ddim({tmp_anchors.numel() / 4, 4})); - tmp_variances.Resize(phi::make_ddim({tmp_variances.numel() / 4, 4})); + tmp_anchors.Resize(common::make_ddim({tmp_anchors.numel() / 4, 4})); + tmp_variances.Resize(common::make_ddim({tmp_variances.numel() / 4, 4})); // output - rpn_rois->Resize(phi::make_ddim({bbox_deltas.numel() / 4, 4})); + rpn_rois->Resize(common::make_ddim({bbox_deltas.numel() / 4, 4})); dev_ctx.template Alloc(rpn_rois); - rpn_roi_probs->Resize(phi::make_ddim({scores.numel(), 1})); + rpn_roi_probs->Resize(common::make_ddim({scores.numel(), 1})); dev_ctx.template Alloc(rpn_roi_probs); auto place = dev_ctx.GetPlace(); @@ -352,8 +352,9 @@ void GenerateProposalsKernel(const Context& dev_ctx, DenseTensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); DenseTensor scores_slice = scores_swap.Slice(i, i + 1); - bbox_deltas_slice.Resize(phi::make_ddim({h_bbox * w_bbox * c_bbox / 4, 4})); - 
scores_slice.Resize(phi::make_ddim({h_score * w_score * c_score, 1})); + bbox_deltas_slice.Resize( + common::make_ddim({h_bbox * w_bbox * c_bbox / 4, 4})); + scores_slice.Resize(common::make_ddim({h_score * w_score * c_score, 1})); std::pair tensor_pair = ProposalForOneImage(dev_ctx, @@ -392,7 +393,7 @@ void GenerateProposalsKernel(const Context& dev_ctx, } if (rpn_rois_num != nullptr) { - rpn_rois_num->Resize(phi::make_ddim({num})); + rpn_rois_num->Resize(common::make_ddim({num})); dev_ctx.template Alloc(rpn_rois_num); int* num_data = rpn_rois_num->data(); memory_utils::Copy( @@ -403,8 +404,8 @@ void GenerateProposalsKernel(const Context& dev_ctx, lod.emplace_back(offset); rpn_rois->set_lod(lod); rpn_roi_probs->set_lod(lod); - rpn_rois->Resize(phi::make_ddim({num_proposals, 4})); - rpn_roi_probs->Resize(phi::make_ddim({num_proposals, 1})); + rpn_rois->Resize(common::make_ddim({num_proposals, 4})); + rpn_roi_probs->Resize(common::make_ddim({num_proposals, 1})); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/grid_sample_kernel.cc b/paddle/phi/kernels/xpu/grid_sample_kernel.cc index c374b2cc9dce68..5f6d4f31f67e77 100644 --- a/paddle/phi/kernels/xpu/grid_sample_kernel.cc +++ b/paddle/phi/kernels/xpu/grid_sample_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/grid_sample_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -31,7 +31,7 @@ void GridSampleKernel(const Context& dev_ctx, // attrs // paddle.nn.functional.grid_sample(x, grid, mode='bilinear', // padding_mode='zeros', align_corners=True, name=None) - const std::string data_format = phi::DataLayoutToString(x.layout()); + const std::string data_format = common::DataLayoutToString(x.layout()); // attr to real param bool is_nearest_bool; @@ -85,7 +85,7 @@ void GridSampleKernel(const Context& dev_ctx, data_format)); } - out->Resize(make_ddim({n, c, out_h, out_w})); + out->Resize(common::make_ddim({n, c, out_h, out_w})); T* output_data = dev_ctx.template Alloc(out); int r = xpu::grid_sample(dev_ctx.x_context(), @@ -111,7 +111,7 @@ void GridSampleKernel(const Context& dev_ctx, int out_h = grid.dims()[2]; int out_w = grid.dims()[3]; - out->Resize(make_ddim({n, c, out_d, out_h, out_w})); + out->Resize(common::make_ddim({n, c, out_d, out_h, out_w})); T* output_data = dev_ctx.template Alloc(out); int r = xpu::grid_sample3d(dev_ctx.x_context(), diff --git a/paddle/phi/kernels/xpu/group_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/group_norm_grad_kernel.cc index 08532e22d86c96..428b2699dc2753 100644 --- a/paddle/phi/kernels/xpu/group_norm_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/group_norm_grad_kernel.cc @@ -19,8 +19,8 @@ #include #include +#include "paddle/common/layout.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -42,10 +42,10 @@ void GroupNormGradKernel(const Context& dev_ctx, DenseTensor* d_scale, DenseTensor* d_bias) { using XPUType = typename XPUTypeTrait::Type; - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); const auto scale_ptr = scale.get_ptr(); const auto bias_ptr = bias.get_ptr(); - const auto x_dims = phi::vectorize(x.dims()); + const auto x_dims = common::vectorize(x.dims()); const int N = x_dims[0]; const bool channel_first 
= data_layout == DataLayout::kNCHW || data_layout == DataLayout::kNCDHW; diff --git a/paddle/phi/kernels/xpu/group_norm_kernel.cc b/paddle/phi/kernels/xpu/group_norm_kernel.cc index 7d82a5d18fee7e..01435f82b2cef8 100644 --- a/paddle/phi/kernels/xpu/group_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/group_norm_kernel.cc @@ -19,8 +19,8 @@ #include #include +#include "paddle/common/layout.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -38,11 +38,11 @@ void GroupNormKernel(const Context& dev_ctx, DenseTensor* var) { using XPUType = typename XPUTypeTrait::Type; - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); + const DataLayout data_layout = common::StringToDataLayout(data_layout_str); const auto scale_ptr = scale.get_ptr(); const auto bias_ptr = bias.get_ptr(); - const auto x_dims = phi::vectorize(x.dims()); + const auto x_dims = common::vectorize(x.dims()); const int N = x_dims[0]; const bool channel_first = data_layout == DataLayout::kNCHW || data_layout == DataLayout::kNCDHW; diff --git a/paddle/phi/kernels/xpu/index_put_kernel.cc b/paddle/phi/kernels/xpu/index_put_kernel.cc index 4197b9698cb3c1..60c91a8e5c83c7 100644 --- a/paddle/phi/kernels/xpu/index_put_kernel.cc +++ b/paddle/phi/kernels/xpu/index_put_kernel.cc @@ -43,20 +43,21 @@ void XPUDealWithIndices(const Context& dev_ctx, expanded_index = casted_index; } else { expanded_index.Resize(bd_dim); - ExpandKernel(dev_ctx, - casted_index, - IntArray(vectorize(bd_dim)), - &expanded_index); + ExpandKernel( + dev_ctx, + casted_index, + IntArray(common::vectorize(bd_dim)), + &expanded_index); } tmp_indices_v.emplace_back(expanded_index); } - auto bd_dim_vec = vectorize(bd_dim); + auto bd_dim_vec = common::vectorize(bd_dim); std::vector stacked_dim_vec(bd_dim.size() + 1); std::copy(bd_dim_vec.begin(), bd_dim_vec.end(), stacked_dim_vec.begin()); stacked_dim_vec.back() = int_indices_v.size(); - out->Resize(make_ddim(stacked_dim_vec)); + out->Resize(common::make_ddim(stacked_dim_vec)); std::vector tmp_indices_ptr(tmp_indices_v.size(), nullptr); @@ -109,13 +110,13 @@ void IndexPutKernel(const Context& dev_ctx, DenseTensor res_indices(DataType::INT64); // Broadcast and merge indices XPUDealWithIndices(dev_ctx, int_indices_v, bd_dims, &res_indices); - auto index_shape = vectorize(res_indices.dims()); - auto x_shape = vectorize(x.dims()); + auto index_shape = common::vectorize(res_indices.dims()); + auto x_shape = common::vectorize(x.dims()); const T* value_data = value.data(); // Broadcast value - auto value_shape = vectorize(value.dims()); + auto value_shape = common::vectorize(value.dims()); int64_t value_rank = bd_dims.size() + (x_shape.size() - int_indices_v.size()); std::vector value_shape_bd(value_rank); std::copy(index_shape.begin(), index_shape.end() - 1, value_shape_bd.begin()); @@ -126,7 +127,7 @@ void IndexPutKernel(const Context& dev_ctx, DenseTensor value_bd(value.dtype()); if (value_shape != value_shape_bd) { - value_bd.Resize(make_ddim(value_shape_bd)); + value_bd.Resize(common::make_ddim(value_shape_bd)); ExpandKernel( dev_ctx, value, IntArray(value_shape_bd), &value_bd); value_data = value_bd.data(); diff --git a/paddle/phi/kernels/xpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/xpu/index_sample_grad_kernel.cc index 22c35ef46840fc..537d50701cd01f 100644 --- a/paddle/phi/kernels/xpu/index_sample_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/index_sample_grad_kernel.cc @@ -41,9 +41,9 @@ void 
IndexSampleGradKernel(const Context& ctx, XPUType* in_grad_data = ctx.template Alloc(in_grad); const XPUType* out_grad_data = out_grad.data(); - auto in_grad_shape = phi::vectorize(in_grad->dims()); - auto out_grad_shape = phi::vectorize(out_grad.dims()); - auto index_shape = phi::vectorize(index.dims()); + auto in_grad_shape = common::vectorize(in_grad->dims()); + auto out_grad_shape = common::vectorize(out_grad.dims()); + auto index_shape = common::vectorize(index.dims()); int r = xpu::constant( ctx.x_context(), in_grad_data, in_grad->numel(), static_cast(0)); diff --git a/paddle/phi/kernels/xpu/index_select_grad_kernel.cc b/paddle/phi/kernels/xpu/index_select_grad_kernel.cc index 14bfce38799f0c..7a3ef41b8261a1 100644 --- a/paddle/phi/kernels/xpu/index_select_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/index_select_grad_kernel.cc @@ -45,8 +45,8 @@ void IndexSelectGradKernel(const Context& ctx, T* x_grad_data = ctx.template Alloc(x_grad); const T* out_grad_data = out_grad.data(); - auto out_grad_shape = phi::vectorize(out_grad.dims()); - auto x_grad_shape = phi::vectorize(x_grad->dims()); + auto out_grad_shape = common::vectorize(out_grad.dims()); + auto x_grad_shape = common::vectorize(x_grad->dims()); int r = xpu::Error_t::SUCCESS; if (index_type == phi::DataType::INT32) { diff --git a/paddle/phi/kernels/xpu/index_select_kernel.cc b/paddle/phi/kernels/xpu/index_select_kernel.cc index 75c19aa028bce7..12395387eccf26 100644 --- a/paddle/phi/kernels/xpu/index_select_kernel.cc +++ b/paddle/phi/kernels/xpu/index_select_kernel.cc @@ -41,7 +41,7 @@ void IndexSelectKernel(const Context& ctx, phi::DataType::INT32, phi::DataType::INT64)); auto* in_data = x.data(); - std::vector in_shape = phi::vectorize(input_dim); + std::vector in_shape = common::vectorize(input_dim); int index_len = output->dims()[dim]; T* out_data = ctx.template Alloc(output); int r = 0; diff --git a/paddle/phi/kernels/xpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/xpu/interpolate_grad_kernel.cc index 0c0570475f7de4..054856862bc15f 100644 --- a/paddle/phi/kernels/xpu/interpolate_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/interpolate_grad_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. 
 #include "paddle/phi/kernels/interpolate_grad_kernel.h"
+#include "paddle/common/layout.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/backends/xpu/xpu_context.h"
-#include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/interpolate_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -39,7 +39,7 @@ void InterpolateGradKernel(
     bool align_corners,
     int align_mode,
     DenseTensor* x_grad) {
-  const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
+  const DataLayout data_layout = common::StringToDataLayout(data_layout_str);
   int n, c, in_d, in_h, in_w;
   funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
diff --git a/paddle/phi/kernels/xpu/interpolate_kernel.cc b/paddle/phi/kernels/xpu/interpolate_kernel.cc
index 0bf7f6e1113fb8..712897ee90079c 100644
--- a/paddle/phi/kernels/xpu/interpolate_kernel.cc
+++ b/paddle/phi/kernels/xpu/interpolate_kernel.cc
@@ -14,9 +14,9 @@
 #include "paddle/phi/kernels/interpolate_kernel.h"
+#include "paddle/common/layout.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/backends/xpu/xpu_context.h"
-#include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/interpolate_function.h"
@@ -39,7 +39,7 @@ void InterpolateKernel(
     int align_mode,
     DenseTensor* output) {
   using XPUType = typename XPUTypeTrait::Type;
-  const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
+  const DataLayout data_layout = common::StringToDataLayout(data_layout_str);
   int n, c, in_d, in_h, in_w;
   phi::funcs::ExtractNCDWH(x.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
diff --git a/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc
index 6218db6ae2b4e4..35220636dffb68 100644
--- a/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc
@@ -34,7 +34,7 @@ void LayerNormGradKernel(const Context& ctx,
                          DenseTensor* bias_grad) {
   using XPUType = typename XPUTypeTrait::Type;
   const auto& x_dims = x.dims();
-  auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
+  auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis);
   int left = static_cast(matrix_dim[0]);
   int right = static_cast(matrix_dim[1]);
   const auto* x_data = x.data();
diff --git a/paddle/phi/kernels/xpu/layer_norm_kernel.cc b/paddle/phi/kernels/xpu/layer_norm_kernel.cc
index 9c6c2ef727fd2c..7c9727bc121999 100644
--- a/paddle/phi/kernels/xpu/layer_norm_kernel.cc
+++ b/paddle/phi/kernels/xpu/layer_norm_kernel.cc
@@ -31,7 +31,7 @@ void LayerNormKernel(const Context& ctx,
                      DenseTensor* variance) {
   using XPUType = typename XPUTypeTrait::Type;
   const auto& x_dims = x.dims();
-  auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
+  auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis);
   int left = static_cast(matrix_dim[0]);
   int right = static_cast(matrix_dim[1]);
   const auto* x_data = x.data();
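(Aside: the two LayerNorm kernels reduce normalization to a 2-D problem: flatten_to_2d collapses the axes before begin_norm_axis into "left" rows and the remaining axes into "right" columns. A self-contained sketch of those assumed semantics — not the library implementation:)

// Sketch of matrix_dim = flatten_to_2d(x_dims, begin_norm_axis).
// Example: x_dims = {8, 16, 32}, begin_norm_axis = 1 -> {8, 512}.
#include <cstdint>
#include <utility>
#include <vector>

std::pair<int64_t, int64_t> FlattenTo2d(const std::vector<int64_t>& dims,
                                        size_t begin_norm_axis) {
  int64_t left = 1, right = 1;
  for (size_t i = 0; i < dims.size(); ++i) {
    (i < begin_norm_axis ? left : right) *= dims[i];
  }
  return {left, right};  // rows normalized independently, cols reduced over
}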
diff --git a/paddle/phi/kernels/xpu/linspace_kernel.cc b/paddle/phi/kernels/xpu/linspace_kernel.cc
index 15799b7d40a1af..38a6b176f796d1 100644
--- a/paddle/phi/kernels/xpu/linspace_kernel.cc
+++ b/paddle/phi/kernels/xpu/linspace_kernel.cc
@@ -67,7 +67,7 @@ void LinspaceKernel(const Context& ctx,
                         "than 0, but received num is %d",
                         num));
-  out->Resize(phi::make_ddim({num}));
+  out->Resize(common::make_ddim({num}));
   T* out_data = ctx.template Alloc(out);
   int r = xpu::linspace(ctx.x_context(),
diff --git a/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc
index 949e40474c7351..bcf69f3966e587 100644
--- a/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc
@@ -38,7 +38,7 @@ void LogSoftmaxGradKernel(const Context& dev_ctx,
     return;
   }
-  auto out_shape = phi::vectorize(out.dims());
+  auto out_shape = common::vectorize(out.dims());
   dev_ctx.template Alloc(x_grad);
   int r = xpu::log_softmax_grad(
       dev_ctx.x_context(),
diff --git a/paddle/phi/kernels/xpu/log_softmax_kernel.cc b/paddle/phi/kernels/xpu/log_softmax_kernel.cc
index 2ee093dbad44c5..429b53e717cffd 100644
--- a/paddle/phi/kernels/xpu/log_softmax_kernel.cc
+++ b/paddle/phi/kernels/xpu/log_softmax_kernel.cc
@@ -37,7 +37,7 @@ void LogSoftmaxKernel(const Context& dev_ctx,
     return;
   }
   if (x.numel() != 0) {
-    auto x_shape = phi::vectorize(x.dims());
+    auto x_shape = common::vectorize(x.dims());
     dev_ctx.template Alloc(out);
     if (axis < 0) axis += rank;
     int r =
diff --git a/paddle/phi/kernels/xpu/logical_kernel.cc b/paddle/phi/kernels/xpu/logical_kernel.cc
index 57dc8b4387489b..4f8b1b75d4cfd1 100644
--- a/paddle/phi/kernels/xpu/logical_kernel.cc
+++ b/paddle/phi/kernels/xpu/logical_kernel.cc
@@ -91,7 +91,7 @@ void LogicalBinaryKernel(
   bool is_x_need_broadcast = false;
   bool is_y_need_broadcast = false;
-  auto out_vec = phi::vectorize(out->dims());
+  auto out_vec = common::vectorize(out->dims());
   for (int i = 0; i < max_dim; i++) {
     if (x_dims_vec[i] != out_vec[i]) {
       is_x_need_broadcast = true;
diff --git a/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc
index 8e2f56adfa1414..6eed8cb524a752 100644
--- a/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc
@@ -31,8 +31,8 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
   auto* out_data =
       reinterpret_cast(dev_ctx.template Alloc(x_grad));
-  auto mask_shape = phi::vectorize(mask.dims());
-  auto xshape = phi::vectorize(x_grad->dims());
+  auto mask_shape = common::vectorize(mask.dims());
+  auto xshape = common::vectorize(x_grad->dims());
   if (mask.dims().size() == 0) {
     mask_shape = std::vector({1});
   }
diff --git a/paddle/phi/kernels/xpu/masked_select_kernel.cc b/paddle/phi/kernels/xpu/masked_select_kernel.cc
index e4af5b5a970970..62803fde27aa5c 100644
--- a/paddle/phi/kernels/xpu/masked_select_kernel.cc
+++ b/paddle/phi/kernels/xpu/masked_select_kernel.cc
@@ -59,8 +59,8 @@ void MaskedSelectKernel(const Context& dev_ctx,
   out->Resize(out_dim);
   auto out_data = reinterpret_cast(dev_ctx.template Alloc(out));
-  auto input_shape = vectorize(input_dim);
-  auto mask_shape = vectorize(mask_dim);
+  auto input_shape = common::vectorize(input_dim);
+  auto mask_shape = common::vectorize(mask_dim);
   if (input_dim.size() == 0) {
     input_shape = std::vector({1});
   }
diff --git a/paddle/phi/kernels/xpu/matmul_grad_kernel.cc b/paddle/phi/kernels/xpu/matmul_grad_kernel.cc
index c4fb311cbe5f0e..f94abe63000178 100644
--- a/paddle/phi/kernels/xpu/matmul_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/matmul_grad_kernel.cc
@@ -114,8 +114,8 @@ void MatmulWithFlattenGradKernel(const Context& dev_ctx,
   auto y_matrix = y.dims().size() > 2 ?
phi::ReshapeToMatrix(y, y_num_col_dims) : static_cast(y); DenseTensor dout_mat; - dout_mat.Resize({phi::flatten_to_2d(x.dims(), x_num_col_dims)[0], - phi::flatten_to_2d(y.dims(), y_num_col_dims)[1]}); + dout_mat.Resize({common::flatten_to_2d(x.dims(), x_num_col_dims)[0], + common::flatten_to_2d(y.dims(), y_num_col_dims)[1]}); if (x_grad != nullptr) { x_grad->set_lod(x.lod()); diff --git a/paddle/phi/kernels/xpu/meshgrid_kernel.cc b/paddle/phi/kernels/xpu/meshgrid_kernel.cc index 4a26a1a946f6dd..4e86e360e1c1d0 100644 --- a/paddle/phi/kernels/xpu/meshgrid_kernel.cc +++ b/paddle/phi/kernels/xpu/meshgrid_kernel.cc @@ -32,7 +32,7 @@ void MeshgridKernel(const Context& ctx, for (const auto& x : inputs) { x_list.push_back(reinterpret_cast(x->data())); - xshape_list.emplace_back(phi::vectorize(x->dims())); + xshape_list.emplace_back(common::vectorize(x->dims())); } for (auto& x : outputs) { ctx.template Alloc(x); diff --git a/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc b/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc index 442d972691f5a4..17746e4eeff0af 100644 --- a/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc +++ b/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc @@ -46,7 +46,7 @@ void MultiClassNMSKernel(const Context& ctx, bool return_index = index != nullptr; bool has_rois_num = rois_num.get_ptr() != nullptr; bool return_rois_num = nms_rois_num != nullptr; - auto score_dims = phi::vectorize(scores.dims()); + auto score_dims = common::vectorize(scores.dims()); auto score_size = score_dims.size(); bool is_lod = score_size == 2 ? true : false; diff --git a/paddle/phi/kernels/xpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/xpu/nll_loss_grad_kernel.cc index 1dbe679e67498e..ca7b1b23273b1e 100644 --- a/paddle/phi/kernels/xpu/nll_loss_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/nll_loss_grad_kernel.cc @@ -45,7 +45,7 @@ void NllLossGradKernel(const Context& dev_ctx, auto d_x_data = dev_ctx.template Alloc(d_x); auto d_x_dims = d_x->dims(); - std::vector d_x_shape = phi::vectorize(d_x_dims); + std::vector d_x_shape = common::vectorize(d_x_dims); auto weight_data = weight.get_ptr() ? 
weight.get_ptr()->data() : nullptr; diff --git a/paddle/phi/kernels/xpu/nll_loss_kernel.cc b/paddle/phi/kernels/xpu/nll_loss_kernel.cc index 2d9bf5baf57670..8bd95efe02b329 100644 --- a/paddle/phi/kernels/xpu/nll_loss_kernel.cc +++ b/paddle/phi/kernels/xpu/nll_loss_kernel.cc @@ -49,7 +49,7 @@ void NllLossRawKernel(const Context& dev_ctx, auto total_weight_data = dev_ctx.template Alloc(total_weight); auto x_dims = x.dims(); - std::vector x_shape = phi::vectorize(x_dims); + std::vector x_shape = common::vectorize(x_dims); int64_t reduction_id = 0; if (reduction == "none") { diff --git a/paddle/phi/kernels/xpu/nonzero_kernel.cc b/paddle/phi/kernels/xpu/nonzero_kernel.cc index fe241965fb5c69..f3d665afaa6643 100644 --- a/paddle/phi/kernels/xpu/nonzero_kernel.cc +++ b/paddle/phi/kernels/xpu/nonzero_kernel.cc @@ -41,14 +41,14 @@ void NonZeroKernel(const Context& dev_ctx, static_cast(true_num), sizeof(int32_t)); - out->Resize(phi::make_ddim({static_cast(true_num_cpu), rank})); + out->Resize(common::make_ddim({static_cast(true_num_cpu), rank})); auto* out_data = dev_ctx.template Alloc(out); if (true_num_cpu == 0) { return; } - auto condition_shape = phi::vectorize(dims); + auto condition_shape = common::vectorize(dims); ret = xpu::where( dev_ctx.x_context(), cond_data, out_data, condition_shape, true_num_cpu); PADDLE_ENFORCE_XDNN_SUCCESS(ret, "where"); diff --git a/paddle/phi/kernels/xpu/p_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/p_norm_grad_kernel.cc index 883e3262a64876..083cf33d0600ac 100644 --- a/paddle/phi/kernels/xpu/p_norm_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/p_norm_grad_kernel.cc @@ -134,7 +134,7 @@ void PNormGradKernel(const Context& dev_ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs"); DenseTensor porder_tensor; - phi::DDim pdim = phi::make_ddim({1}); + phi::DDim pdim = common::make_ddim({1}); porder_tensor.Resize(pdim); dev_ctx.template Alloc(&porder_tensor); r = xpu::constant( diff --git a/paddle/phi/kernels/xpu/p_norm_kernel.cc b/paddle/phi/kernels/xpu/p_norm_kernel.cc index 60abc59517b786..722fc4cc3aba05 100644 --- a/paddle/phi/kernels/xpu/p_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/p_norm_kernel.cc @@ -125,7 +125,7 @@ void PNormKernel(const Context& dev_ctx, } else { DenseTensor porder_tensor; - phi::DDim pdim = phi::make_ddim({1}); + phi::DDim pdim = common::make_ddim({1}); porder_tensor.Resize(pdim); dev_ctx.template Alloc(&porder_tensor); r = xpu::constant( diff --git a/paddle/phi/kernels/xpu/pad3d_grad_kernel.cc b/paddle/phi/kernels/xpu/pad3d_grad_kernel.cc index d5960b02cf91ac..2599458e44733a 100644 --- a/paddle/phi/kernels/xpu/pad3d_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/pad3d_grad_kernel.cc @@ -34,7 +34,7 @@ void Pad3dGradKernel(const Context& dev_ctx, auto* d_out = &out_grad; auto* d_in = x_grad; - auto d_in_dims = vectorize(d_in->dims()); + auto d_in_dims = common::vectorize(d_in->dims()); const T* d_out_data = d_out->data(); T* d_in_data = dev_ctx.template Alloc(d_in); diff --git a/paddle/phi/kernels/xpu/pad_grad_kernel.cc b/paddle/phi/kernels/xpu/pad_grad_kernel.cc index 45fc3393412cd8..6eb2741ac5521d 100644 --- a/paddle/phi/kernels/xpu/pad_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/pad_grad_kernel.cc @@ -26,7 +26,7 @@ void PadGradKernel(const Context& dev_ctx, DenseTensor* d_x) { using XPUType = typename XPUTypeTrait::Type; std::vector pad_left, pad_right; - std::vector out_shape = vectorize(d_out.dims()); + std::vector out_shape = common::vectorize(d_out.dims()); dev_ctx.template Alloc(d_x); for (size_t i = 0; i < paddings.size() / 2; ++i) { diff --git 
a/paddle/phi/kernels/xpu/pad_kernel.cc b/paddle/phi/kernels/xpu/pad_kernel.cc
index 899503e328607b..63906b8cb09351 100644
--- a/paddle/phi/kernels/xpu/pad_kernel.cc
+++ b/paddle/phi/kernels/xpu/pad_kernel.cc
@@ -27,7 +27,7 @@ void PadKernel(const Context& dev_ctx,
   using XPUType = typename XPUTypeTrait::Type;
   dev_ctx.template Alloc(out);
   std::vector pad_left, pad_right;
-  std::vector xshape = vectorize(x.dims());
+  std::vector xshape = common::vectorize(x.dims());
   for (size_t i = 0; i < paddings.size() / 2; ++i) {
     pad_left.push_back(paddings[i * 2]);
diff --git a/paddle/phi/kernels/xpu/pow2_decay_with_linear_warmup_kernel.cc b/paddle/phi/kernels/xpu/pow2_decay_with_linear_warmup_kernel.cc
index bfda5688bb3407..9195613c315f49 100644
--- a/paddle/phi/kernels/xpu/pow2_decay_with_linear_warmup_kernel.cc
+++ b/paddle/phi/kernels/xpu/pow2_decay_with_linear_warmup_kernel.cc
@@ -14,10 +14,10 @@
 #include "paddle/phi/kernels/pow2_decay_with_linear_warmup_kernel.h"
+#include "paddle/common/macros.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/backends/xpu/xpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/core/macros.h"
 namespace phi {
diff --git a/paddle/phi/kernels/xpu/randint_kernel.cc b/paddle/phi/kernels/xpu/randint_kernel.cc
index ce86d7e77a9fd5..f284846637f09f 100644
--- a/paddle/phi/kernels/xpu/randint_kernel.cc
+++ b/paddle/phi/kernels/xpu/randint_kernel.cc
@@ -32,7 +32,7 @@ void RandintKernel(const Context& dev_ctx,
                    DenseTensor* out) {
   int seed = 0;
   int64_t size = out->numel();
-  out->Resize(phi::make_ddim(shape.GetData()));
+  out->Resize(common::make_ddim(shape.GetData()));
   T* data = dev_ctx.template Alloc(out);
   auto numel = out->numel();
   std::shared_ptr engine;
diff --git a/paddle/phi/kernels/xpu/randperm_kernel.cc b/paddle/phi/kernels/xpu/randperm_kernel.cc
index b5ba469b837813..a90691c14e7028 100644
--- a/paddle/phi/kernels/xpu/randperm_kernel.cc
+++ b/paddle/phi/kernels/xpu/randperm_kernel.cc
@@ -43,7 +43,7 @@ void RandpermKernel(const Context& dev_ctx,
   } else {
     dev_ctx.template Alloc(out);
     phi::DenseTensor tmp_tensor;
-    tmp_tensor.Resize(phi::make_ddim({n}));
+    tmp_tensor.Resize(common::make_ddim({n}));
     T* tmp_data = dev_ctx.template HostAlloc(&tmp_tensor);
     for (int i = 0; i < n; ++i) {
       tmp_data[i] = static_cast(i);
diff --git a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc
index afe84e43d99d14..c5b0950552629d 100644
--- a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc
@@ -39,8 +39,8 @@ void ReduceMeanGradKernel(const Context& dev_ctx,
   auto reduce_dims = dims_arr.GetData();
-  std::vector xdims = vectorize(x.dims());
-  std::vector ydims = vectorize(out_grad.dims());
+  std::vector xdims = common::vectorize(x.dims());
+  std::vector ydims = common::vectorize(out_grad.dims());
   int reduce_numel = 1;
   if (reduce_all) {
diff --git a/paddle/phi/kernels/xpu/scatter_kernel.cc b/paddle/phi/kernels/xpu/scatter_kernel.cc
index 9052cd5b5f5f0d..fc13bd92b90dec 100644
--- a/paddle/phi/kernels/xpu/scatter_kernel.cc
+++ b/paddle/phi/kernels/xpu/scatter_kernel.cc
@@ -79,8 +79,8 @@ void ScatterKernel(const Context &ctx,
   }
   int dim0 = static_cast(x.dims()[0]);
-  int dim1 =
-      static_cast(phi::product(phi::slice_ddim(x_dims, 1, x_dims.size())));
+  int dim1 = static_cast(
+      common::product(common::slice_ddim(x_dims, 1, x_dims.size())));
   DenseTensor indices_cpu(index.type());
   phi::Copy(ctx, index, phi::CPUPlace(), true, &indices_cpu);
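(Aside: ScatterKernel above views x as a dim0-by-dim1 matrix — dim0 from the first axis, dim1 as the product of all remaining axes, which is exactly the product(slice_ddim(x_dims, 1, rank)) expression. The same idiom sketched standalone; RowWidth is a hypothetical helper, not code from this patch.)

// Sketch of dim1 = product(slice_ddim(x_dims, 1, x_dims.size())).
#include <cstdint>
#include <vector>

int64_t RowWidth(const std::vector<int64_t>& x_dims) {
  int64_t dim1 = 1;
  for (size_t i = 1; i < x_dims.size(); ++i) {  // skip axis 0 (the rows)
    dim1 *= x_dims[i];  // e.g. {100, 4, 8} -> 32 elements per scattered row
  }
  return dim1;
}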
diff --git a/paddle/phi/kernels/xpu/scatter_nd_add_grad_kernel.cc b/paddle/phi/kernels/xpu/scatter_nd_add_grad_kernel.cc
index a0fd86fcc3208d..37e6e91ea779e3 100644
--- a/paddle/phi/kernels/xpu/scatter_nd_add_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/scatter_nd_add_grad_kernel.cc
@@ -52,8 +52,8 @@ void ScatterNdAddGradKernel(const Context &ctx,
       errors::InvalidArgument(
           "Size of the last dim of the index tensor [%d] should be 0",
           end_size));
-  auto remain_dims = phi::slice_ddim(index_dims, 0, index_dims_size - 1);
-  int64_t remain_numel = phi::product(remain_dims);
+  auto remain_dims = common::slice_ddim(index_dims, 0, index_dims_size - 1);
+  int64_t remain_numel = common::product(remain_dims);
   int64_t updates_grad_numel = updates_grad->numel();
   int64_t out_grad_numel = out_grad.numel();
   PADDLE_ENFORCE_EQ(
@@ -73,11 +73,11 @@ void ScatterNdAddGradKernel(const Context &ctx,
     return;
   }
-  auto index_shape_vec = vectorize(index.dims());
+  auto index_shape_vec = common::vectorize(index.dims());
   if (index_shape_vec.size() == 1) {
     index_shape_vec.insert(index_shape_vec.begin(), 1);
   }
-  auto out_grad_shape_vec = vectorize(out_grad.dims());
+  auto out_grad_shape_vec = common::vectorize(out_grad.dims());
   xpu::VectorParam out_grad_shape_param = {
       out_grad_shape_vec.data(),
       static_cast(out_grad_shape_vec.size()),
diff --git a/paddle/phi/kernels/xpu/scatter_nd_add_kernel.cc b/paddle/phi/kernels/xpu/scatter_nd_add_kernel.cc
index 69e40994eb92de..9b826aecdb8b39 100644
--- a/paddle/phi/kernels/xpu/scatter_nd_add_kernel.cc
+++ b/paddle/phi/kernels/xpu/scatter_nd_add_kernel.cc
@@ -37,7 +37,7 @@ void ScatterNdAddKernel(const Context &ctx,
   int64_t index_dims_size = index.dims().size();
   int loop_time = static_cast(
       index_dims_size == 0 ? 1
-                           : phi::product(phi::slice_ddim(
+                           : common::product(common::slice_ddim(
                                  index.dims(), 0, index_dims_size - 1)));
   for (int i = 0; i < loop_time; i++) {
@@ -64,8 +64,8 @@ void ScatterNdAddKernel(const Context &ctx,
                                        phi::DataType::INT32,
                                        phi::DataType::INT64));
-  auto x_shape = phi::vectorize(x.dims());
-  auto index_shape = phi::vectorize(index.dims());
+  auto x_shape = common::vectorize(x.dims());
+  auto index_shape = common::vectorize(index.dims());
   if (index_shape.size() == 1) {
     index_shape.insert(index_shape.begin(), 1);
   }
diff --git a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc
index d80a2a97da8cfe..d1ad332cd626c5 100644
--- a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc
@@ -67,7 +67,7 @@ void SetValueGradImpl(const Context& dev_ctx,
           "The input of `set_value_grad`(out_grad) has not been initialized"));
   auto in_dims = out_grad.dims();
-  auto in_dims_vector = phi::vectorize(in_dims);
+  auto in_dims_vector = common::vectorize(in_dims);
   std::vector decrease_axis_int32(decrease_axes.begin(),
                                   decrease_axes.end());
@@ -88,7 +88,7 @@ void SetValueGradImpl(const Context& dev_ctx,
                                  axes.size(),
                                  false);
-  DDim out_dims(phi::make_ddim(out_dims_vector));
+  DDim out_dims(common::make_ddim(out_dims_vector));
   std::vector reverse_vector(starts_local.size(), 0);
   funcs::StridedSliceFunctor(starts_local.data(),
@@ -159,7 +159,7 @@ void SetValueGradImpl(const Context& dev_ctx,
         reinterpret_cast(tmp.data()),
         reinterpret_cast(x_grad->data()),
         out_dims_vector,
-        phi::vectorize(x_grad->dims()),
+        common::vectorize(x_grad->dims()),
         starts_indices,
         ends_indices,
         steps_indices);
@@ -265,11 +265,11 @@ void SetValueGradImpl(const Context& dev_ctx,
         Full(dev_ctx,
              {fake_value_grad_dims.Get(),
fake_value_grad_dims.size()}, static_cast(0)); - auto value_grad_dims_vec = phi::vectorize(value_grad_dims); + auto value_grad_dims_vec = common::vectorize(value_grad_dims); // for value is a 0-D Tensor if (value_grad_dims.size() == 0) { - value_grad_dims_vec = - phi::vectorize(phi::make_ddim(std::vector({1}))); + value_grad_dims_vec = common::vectorize( + common::make_ddim(std::vector({1}))); } for (auto offset : offsets) { for (int i = 0; i < out_dims_size; i++) { @@ -279,7 +279,7 @@ void SetValueGradImpl(const Context& dev_ctx, reinterpret_cast(tmp.data()), reinterpret_cast(tmp2.data()), out_dims_vector, - phi::vectorize(offset), + common::vectorize(offset), slice_end); PADDLE_ENFORCE_XDNN_SUCCESS(r, "slice"); r = xpu::broadcast_add( diff --git a/paddle/phi/kernels/xpu/set_value_kernel.cc b/paddle/phi/kernels/xpu/set_value_kernel.cc index a706ef00b9a41d..c457a6d21fd8a1 100644 --- a/paddle/phi/kernels/xpu/set_value_kernel.cc +++ b/paddle/phi/kernels/xpu/set_value_kernel.cc @@ -122,7 +122,7 @@ void SetValueImpl(const Context& dev_ctx, none_axes_cur++; } - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); + slice_dims_for_assign = common::make_ddim(slice_dims_with_none); } // Here copy data from input to avoid data loss at PE and Graph level. @@ -146,7 +146,7 @@ void SetValueImpl(const Context& dev_ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - int64_t slice_numels = phi::product(slice_dims); + int64_t slice_numels = common::product(slice_dims); XPUType* slice_data = RAII_GUARD.alloc_l3_or_gm(slice_numels); int in_size = in_dims.size(); @@ -227,8 +227,8 @@ void SetValueImpl(const Context& dev_ctx, } } - auto out_shape = phi::vectorize(out->dims()); - auto slice_shape = phi::vectorize(slice_dims); + auto out_shape = common::vectorize(out->dims()); + auto slice_shape = common::vectorize(slice_dims); if (need_flip) { r = xpu::flip(dev_ctx.x_context(), @@ -407,7 +407,7 @@ void SetValueKernel(const Context& dev_ctx, phi::CPUPlace(), value_data_uint8_cpu, values_length); - auto value_dims = phi::make_ddim(shape); + auto value_dims = common::make_ddim(shape); SetValueKernelImpl(dev_ctx, x, diff --git a/paddle/phi/kernels/xpu/split_kernel.cc b/paddle/phi/kernels/xpu/split_kernel.cc index e3aeb7ffdfbe32..eded1bc67c2fec 100644 --- a/paddle/phi/kernels/xpu/split_kernel.cc +++ b/paddle/phi/kernels/xpu/split_kernel.cc @@ -28,7 +28,7 @@ void SplitKernel(const Context& dev_ctx, using XPUType = typename XPUTypeTrait::Type; int axis = axis_scalar.to(); auto in_dims = x.dims(); - auto input_shape = vectorize(in_dims); + auto input_shape = common::vectorize(in_dims); std::vector out_ptrs; std::vector split_lists; for (size_t j = 0; j < outs.size(); ++j) { diff --git a/paddle/phi/kernels/xpu/stack_grad_kernel.cc b/paddle/phi/kernels/xpu/stack_grad_kernel.cc index cbc91e13dfc64e..be82010d696ce5 100644 --- a/paddle/phi/kernels/xpu/stack_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/stack_grad_kernel.cc @@ -29,7 +29,7 @@ void StackGradKernel(const Context& dev_ctx, auto dy_dims = out.dims(); if (axis < 0) axis += dy_dims.size(); - auto dy_shape = phi::vectorize(dy_dims); + auto dy_shape = common::vectorize(dy_dims); std::vector dx_dims_list(x_grad.size(), 1); std::vector dx_lists; diff --git a/paddle/phi/kernels/xpu/stride_slice_kernel.cc b/paddle/phi/kernels/xpu/stride_slice_kernel.cc index 2f026bae02fe45..5aee59729b52ef 100644 --- a/paddle/phi/kernels/xpu/stride_slice_kernel.cc +++ b/paddle/phi/kernels/xpu/stride_slice_kernel.cc @@ -49,7 +49,7 @@ void 
StridedSliceRawKernel(const Context& dev_ctx, out_dims_vector.data(), axes.size(), false); - DDim out_dims(phi::make_ddim(out_dims_vector)); + DDim out_dims(common::make_ddim(out_dims_vector)); out->Resize(out_dims); dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/xpu/strided_copy_kernel.cc b/paddle/phi/kernels/xpu/strided_copy_kernel.cc index 4721d24f4e1194..4e37d439123c63 100644 --- a/paddle/phi/kernels/xpu/strided_copy_kernel.cc +++ b/paddle/phi/kernels/xpu/strided_copy_kernel.cc @@ -27,8 +27,8 @@ void StridedCopyKernel(const Context& dev_ctx, int64_t offset, DenseTensor* out) { phi::DenseTensorMeta meta = input.meta(); - meta.strides = phi::make_ddim(out_stride); - meta.dims = phi::make_ddim(dims); + meta.strides = common::make_ddim(out_stride); + meta.dims = common::make_ddim(dims); meta.offset = offset; out->set_meta(meta); @@ -61,10 +61,10 @@ void StridedCopyKernel(const Context& dev_ctx, r = xpu::strided_copy(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(out->dims()), - phi::vectorize(input.strides()), - phi::vectorize(out->strides())); + common::vectorize(input.dims()), + common::vectorize(out->dims()), + common::vectorize(input.strides()), + common::vectorize(out->strides())); } } else if (std::is_same::value) { auto input_data = reinterpret_cast(input.data()); @@ -76,13 +76,14 @@ void StridedCopyKernel(const Context& dev_ctx, if (input.numel() == 1) { r = xpu::copy(dev_ctx.x_context(), input_data, output_data, 1); } else { - r = xpu::strided_copy(dev_ctx.x_context(), - input_data, - output_data, - phi::vectorize(input.dims()), - phi::vectorize(out->dims()), - phi::vectorize(input.strides()), - phi::vectorize(out->strides())); + r = xpu::strided_copy( + dev_ctx.x_context(), + input_data, + output_data, + common::vectorize(input.dims()), + common::vectorize(out->dims()), + common::vectorize(input.strides()), + common::vectorize(out->strides())); } } else if (std::is_same::value) { using XPUFLOAT16 = typename XPUTypeTrait::Type; @@ -100,10 +101,10 @@ void StridedCopyKernel(const Context& dev_ctx, dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(out->dims()), - phi::vectorize(input.strides()), - phi::vectorize(out->strides())); + common::vectorize(input.dims()), + common::vectorize(out->dims()), + common::vectorize(input.strides()), + common::vectorize(out->strides())); } } else if (std::is_same::value) { using XPUFLOAT16 = typename XPUTypeTrait::Type; @@ -121,10 +122,10 @@ void StridedCopyKernel(const Context& dev_ctx, dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(out->dims()), - phi::vectorize(input.strides()), - phi::vectorize(out->strides())); + common::vectorize(input.dims()), + common::vectorize(out->dims()), + common::vectorize(input.strides()), + common::vectorize(out->strides())); } } else if (std::is_same::value) { using XPUFLOAT16 = typename XPUTypeTrait::Type; @@ -142,10 +143,10 @@ void StridedCopyKernel(const Context& dev_ctx, dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(out->dims()), - phi::vectorize(input.strides()), - phi::vectorize(out->strides())); + common::vectorize(input.dims()), + common::vectorize(out->dims()), + common::vectorize(input.strides()), + common::vectorize(out->strides())); } } else if (std::is_same::value) { auto input_data = reinterpret_cast(input.data()); @@ -160,10 +161,10 @@ void StridedCopyKernel(const Context& dev_ctx, r = 
xpu::strided_copy(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(out->dims()), - phi::vectorize(input.strides()), - phi::vectorize(out->strides())); + common::vectorize(input.dims()), + common::vectorize(out->dims()), + common::vectorize(input.strides()), + common::vectorize(out->strides())); } } else if (std::is_same::value) { auto input_data = reinterpret_cast(input.data()); @@ -178,10 +179,10 @@ void StridedCopyKernel(const Context& dev_ctx, r = xpu::strided_copy(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(out->dims()), - phi::vectorize(input.strides()), - phi::vectorize(out->strides())); + common::vectorize(input.dims()), + common::vectorize(out->dims()), + common::vectorize(input.strides()), + common::vectorize(out->strides())); } } else if (std::is_same::value) { auto input_data = reinterpret_cast(input.data()); @@ -193,13 +194,14 @@ void StridedCopyKernel(const Context& dev_ctx, if (input.numel() == 1) { r = xpu::copy(dev_ctx.x_context(), input_data, output_data, 1); } else { - r = xpu::strided_copy(dev_ctx.x_context(), - input_data, - output_data, - phi::vectorize(input.dims()), - phi::vectorize(out->dims()), - phi::vectorize(input.strides()), - phi::vectorize(out->strides())); + r = xpu::strided_copy( + dev_ctx.x_context(), + input_data, + output_data, + common::vectorize(input.dims()), + common::vectorize(out->dims()), + common::vectorize(input.strides()), + common::vectorize(out->strides())); } } else if (std::is_same::value) { auto input_data = reinterpret_cast(input.data()); @@ -211,13 +213,14 @@ void StridedCopyKernel(const Context& dev_ctx, if (input.numel() == 1) { r = xpu::copy(dev_ctx.x_context(), input_data, output_data, 1); } else { - r = xpu::strided_copy(dev_ctx.x_context(), - input_data, - output_data, - phi::vectorize(input.dims()), - phi::vectorize(out->dims()), - phi::vectorize(input.strides()), - phi::vectorize(out->strides())); + r = xpu::strided_copy( + dev_ctx.x_context(), + input_data, + output_data, + common::vectorize(input.dims()), + common::vectorize(out->dims()), + common::vectorize(input.strides()), + common::vectorize(out->strides())); } } else if (std::is_same::value) { auto input_data = reinterpret_cast(input.data()); @@ -233,10 +236,10 @@ void StridedCopyKernel(const Context& dev_ctx, r = xpu::strided_copy(dev_ctx.x_context(), input_data, output_data, - phi::vectorize(input.dims()), - phi::vectorize(out->dims()), - phi::vectorize(input.strides()), - phi::vectorize(out->strides())); + common::vectorize(input.dims()), + common::vectorize(out->dims()), + common::vectorize(input.strides()), + common::vectorize(out->strides())); } } else { PADDLE_THROW(phi::errors::InvalidArgument( diff --git a/paddle/phi/kernels/xpu/take_along_axis_kernel.cc b/paddle/phi/kernels/xpu/take_along_axis_kernel.cc index db98be61206742..e55604e768b9af 100644 --- a/paddle/phi/kernels/xpu/take_along_axis_kernel.cc +++ b/paddle/phi/kernels/xpu/take_along_axis_kernel.cc @@ -16,8 +16,8 @@ #include "glog/logging.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/xpu/temporal_shift_grad_kernel.cc b/paddle/phi/kernels/xpu/temporal_shift_grad_kernel.cc index bba2442fdc9e40..37ecad59285ee3 100644 --- a/paddle/phi/kernels/xpu/temporal_shift_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/temporal_shift_grad_kernel.cc @@ -14,8 
+14,8 @@ #include "paddle/phi/kernels/temporal_shift_grad_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/axis_utils.h" @@ -31,7 +31,7 @@ void TemporalShiftGradKernel(const Context& dev_ctx, auto* input_grad = x_grad; auto* output_grad = &out_grad; int t = seg_num; - const DataLayout data_layout = phi::StringToDataLayout(data_format_str); + const DataLayout data_layout = common::StringToDataLayout(data_format_str); const int nt = output_grad->dims()[0]; const int n = nt / t; @@ -43,8 +43,8 @@ void TemporalShiftGradKernel(const Context& dev_ctx, : output_grad->dims()[2]); DDim in_grad_dims = - (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w}) - : phi::make_ddim({nt, h, w, c})); + (data_layout == DataLayout::kNCHW ? common::make_ddim({nt, c, h, w}) + : common::make_ddim({nt, h, w, c})); const T* output_grad_data = output_grad->data<T>(); input_grad->Resize(in_grad_dims); T* input_grad_data = dev_ctx.template Alloc<T>(input_grad); diff --git a/paddle/phi/kernels/xpu/temporal_shift_kernel.cc b/paddle/phi/kernels/xpu/temporal_shift_kernel.cc index 3da9873d2c01ac..61ee3b555f0dee 100644 --- a/paddle/phi/kernels/xpu/temporal_shift_kernel.cc +++ b/paddle/phi/kernels/xpu/temporal_shift_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/temporal_shift_kernel.h" +#include "paddle/common/layout.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/axis_utils.h" @@ -31,7 +31,7 @@ void TemporalShiftKernel(const Context& dev_ctx, auto* input = &x; auto* output = out; int t = seg_num; - const DataLayout data_layout = phi::StringToDataLayout(data_format_str); + const DataLayout data_layout = common::StringToDataLayout(data_format_str); const int nt = input->dims()[0]; const int n = nt / t; @@ -43,8 +43,8 @@ void TemporalShiftKernel(const Context& dev_ctx, (data_layout == DataLayout::kNCHW ? input->dims()[3] : input->dims()[2]); DDim out_dims = - (data_layout == DataLayout::kNCHW ? 
common::make_ddim({nt, c, h, w}) + : common::make_ddim({nt, h, w, c})); const T* input_data = input->data(); output->Resize(out_dims); T* output_data = dev_ctx.template Alloc(output); diff --git a/paddle/phi/kernels/xpu/tile_grad_kernel.cc b/paddle/phi/kernels/xpu/tile_grad_kernel.cc index c9dce98d192343..b131c168549607 100644 --- a/paddle/phi/kernels/xpu/tile_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/tile_grad_kernel.cc @@ -26,7 +26,7 @@ void TileGradKernel(const Context& dev_ctx, const IntArray& repeat_times, DenseTensor* x_grad) { auto x_dims = x.dims(); - auto vec_x_dims = phi::vectorize(x_dims); + auto vec_x_dims = common::vectorize(x_dims); auto repeat_times_data = repeat_times.GetData(); if (repeat_times_data.size() < vec_x_dims.size()) { int diff = vec_x_dims.size() - repeat_times_data.size(); diff --git a/paddle/phi/kernels/xpu/tile_kernel.cc b/paddle/phi/kernels/xpu/tile_kernel.cc index f6bc716a7d58a7..cce230c970bf97 100644 --- a/paddle/phi/kernels/xpu/tile_kernel.cc +++ b/paddle/phi/kernels/xpu/tile_kernel.cc @@ -74,7 +74,7 @@ void TileKernel(const Context& dev_ctx, "be positive integers, but the value received is %d.", repeat_times[i])); } - auto vec_in_dims = phi::vectorize(in_dims); + auto vec_in_dims = common::vectorize(in_dims); if (repeat_times.size() < vec_in_dims.size()) { int diff = vec_in_dims.size() - repeat_times.size(); repeat_times.insert(repeat_times.begin(), diff, 1); @@ -91,13 +91,13 @@ void TileKernel(const Context& dev_ctx, vec_in_dims.size(), repeat_times.size())); - DDim new_in_dims = phi::make_ddim(vec_in_dims); + DDim new_in_dims = common::make_ddim(vec_in_dims); DDim out_dims(new_in_dims); for (size_t i = 0; i < repeat_times.size(); ++i) { out_dims[i] *= repeat_times[i]; } - auto vec_out_dims = phi::vectorize(out_dims); + auto vec_out_dims = common::vectorize(out_dims); out->Resize(out_dims); dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/xpu/top_k_kernel.cc b/paddle/phi/kernels/xpu/top_k_kernel.cc index a3a37db5e6e0fb..146a09ef410f55 100644 --- a/paddle/phi/kernels/xpu/top_k_kernel.cc +++ b/paddle/phi/kernels/xpu/top_k_kernel.cc @@ -65,7 +65,7 @@ void TopkKernel(const Context& dev_ctx, PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int_data); const size_t row = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + common::product(common::slice_ddim(in_dims, 0, in_dims.size() - 1)); const size_t col = in_dims[in_dims.size() - 1]; int r = xpu::sorted_topk(dev_ctx.x_context(), reinterpret_cast(in_data), @@ -131,8 +131,8 @@ void TopkKernel(const Context& dev_ctx, int32_t* trans_idx_int32_data = RAII_GUARD.alloc_l3_or_gm(out->numel()); PADDLE_ENFORCE_XDNN_NOT_NULL(trans_idx_int32_data); - const size_t row = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const size_t row = common::product( + common::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const size_t col = trans_dims[trans_dims.size() - 1]; // Do top k on transposed input diff --git a/paddle/phi/kernels/xpu/transpose_grad_kernel.cc b/paddle/phi/kernels/xpu/transpose_grad_kernel.cc index 71b2187bddce10..ab6be8c3347cac 100644 --- a/paddle/phi/kernels/xpu/transpose_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/transpose_grad_kernel.cc @@ -48,7 +48,7 @@ void TransposeGradKernel(const Context& dev_ctx, reversed_axis[formated_axis[i]] = i; } - std::vector out_grad_dim_vec = phi::vectorize(out_grad.dims()); + std::vector out_grad_dim_vec = common::vectorize(out_grad.dims()); int r = xpu::transpose( dev_ctx.x_context(), reinterpret_cast(out_grad.data()), diff 
--git a/paddle/phi/kernels/xpu/transpose_kernel.cc b/paddle/phi/kernels/xpu/transpose_kernel.cc index dd985ddc7ebc58..f88e06b18e88db 100644 --- a/paddle/phi/kernels/xpu/transpose_kernel.cc +++ b/paddle/phi/kernels/xpu/transpose_kernel.cc @@ -43,7 +43,7 @@ void TransposeKernel(const Context& dev_ctx, return; } - std::vector x_dim_vec = phi::vectorize(x.dims()); + std::vector x_dim_vec = common::vectorize(x.dims()); int r = xpu::transpose(dev_ctx.x_context(), reinterpret_cast(x.data()), reinterpret_cast(out->data()), diff --git a/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc b/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc index aa3fbb8f9423de..4f672c6b609290 100644 --- a/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc @@ -27,7 +27,7 @@ void TrilTriuGradKernel(const Context& ctx, DenseTensor* x_grad) { using XPUType = typename XPUTypeTrait::Type; ctx.template Alloc(x_grad); - auto dy_shape = vectorize(out_grad.dims()); + auto dy_shape = common::vectorize(out_grad.dims()); int r = 0; if (lower) { r = xpu::tril(ctx.x_context(), diff --git a/paddle/phi/kernels/xpu/tril_triu_kernel.cc b/paddle/phi/kernels/xpu/tril_triu_kernel.cc index e72d5b73dfa5b9..26169136c9d3c8 100644 --- a/paddle/phi/kernels/xpu/tril_triu_kernel.cc +++ b/paddle/phi/kernels/xpu/tril_triu_kernel.cc @@ -27,7 +27,7 @@ void TrilTriuKernel(const Context& ctx, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; ctx.template Alloc(out); - auto xshape = vectorize(x.dims()); + auto xshape = common::vectorize(x.dims()); int r = 0; if (lower) { r = xpu::tril(ctx.x_context(), diff --git a/paddle/phi/kernels/xpu/unbind_kernel.cc b/paddle/phi/kernels/xpu/unbind_kernel.cc index fb7ebc9c13452e..ba59be52884ded 100644 --- a/paddle/phi/kernels/xpu/unbind_kernel.cc +++ b/paddle/phi/kernels/xpu/unbind_kernel.cc @@ -32,7 +32,7 @@ void UnbindKernel(const Context& dev_ctx, dev_ctx.template Alloc(outs[j]); y_ptrs.emplace_back(outs[j]->data()); } - auto x_shape = vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); int r = xpu::unbind(dev_ctx.x_context(), x.data(), y_ptrs, x_shape, axis); PADDLE_ENFORCE_XDNN_SUCCESS(r, "unbind"); } diff --git a/paddle/phi/kernels/xpu/unfold_grad_kernel.cc b/paddle/phi/kernels/xpu/unfold_grad_kernel.cc index 298d6655331da0..1bf6e989d7029b 100644 --- a/paddle/phi/kernels/xpu/unfold_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/unfold_grad_kernel.cc @@ -31,7 +31,7 @@ void UnfoldGradKernel(const Context& ctx, DenseTensor* x_grad) { using XPUType = typename XPUTypeTrait::Type; ctx.template Alloc(x_grad); - const std::string data_format = phi::DataLayoutToString(x.layout()); + const std::string data_format = common::DataLayoutToString(x.layout()); bool is_nchw = data_format == "NCHW"; PADDLE_ENFORCE_EQ(is_nchw, true, diff --git a/paddle/phi/kernels/xpu/unfold_kernel.cc b/paddle/phi/kernels/xpu/unfold_kernel.cc index 64a12b2881296e..2ed1860128140b 100644 --- a/paddle/phi/kernels/xpu/unfold_kernel.cc +++ b/paddle/phi/kernels/xpu/unfold_kernel.cc @@ -30,7 +30,7 @@ void UnfoldKernel(const Context& ctx, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; ctx.template Alloc(out); - const std::string data_format = phi::DataLayoutToString(x.layout()); + const std::string data_format = common::DataLayoutToString(x.layout()); bool is_nchw = data_format == "NCHW"; PADDLE_ENFORCE_EQ(is_nchw, true, diff --git a/paddle/phi/kernels/xpu/uniform_kernel.cc b/paddle/phi/kernels/xpu/uniform_kernel.cc index ead65b65a8466f..a53924e3619410 100644 --- 
a/paddle/phi/kernels/xpu/uniform_kernel.cc +++ b/paddle/phi/kernels/xpu/uniform_kernel.cc @@ -28,7 +28,7 @@ void UniformKernel(const Context &dev_ctx, const Scalar &max, int seed, DenseTensor *out) { - out->Resize(phi::make_ddim(shape.GetData())); + out->Resize(common::make_ddim(shape.GetData())); T *data = dev_ctx.template Alloc(out); if (out->numel() == 0) { return; diff --git a/paddle/phi/kernels/xpu/unique_kernel.cc b/paddle/phi/kernels/xpu/unique_kernel.cc index 6f2d8f470a2120..944276a4b6f51a 100644 --- a/paddle/phi/kernels/xpu/unique_kernel.cc +++ b/paddle/phi/kernels/xpu/unique_kernel.cc @@ -63,23 +63,23 @@ void XPUFlattenUniqueKernelImpl(const Context& dev_ctx, unique_len_xpu, sizeof(int64_t)); } - out->Resize(phi::make_ddim({unique_len_cpu})); + out->Resize(common::make_ddim({unique_len_cpu})); auto* out_data = dev_ctx.template Alloc(out); IndexT* indices_data = nullptr; if (return_index) { - indices->Resize(phi::make_ddim({unique_len_cpu})); + indices->Resize(common::make_ddim({unique_len_cpu})); indices_data = dev_ctx.template Alloc(indices); } IndexT* inverse_data = nullptr; if (return_inverse) { - index->Resize(phi::make_ddim({x_len})); + index->Resize(common::make_ddim({x_len})); inverse_data = dev_ctx.template Alloc(index); } IndexT* counts_data = nullptr; if (return_counts) { - counts->Resize(phi::make_ddim({unique_len_cpu})); + counts->Resize(common::make_ddim({unique_len_cpu})); counts_data = dev_ctx.template Alloc(counts); } if (x_len == 0) { @@ -124,7 +124,7 @@ void XPUDimUniqueKernelImpl(const Context& dev_ctx, permute[axis] = 0; permute[0] = axis; if (axis != 0) { - auto x_shape = vectorize(x.dims()); + auto x_shape = common::vectorize(x.dims()); r = xpu::transpose(dev_ctx.x_context(), reinterpret_cast(x_data), x_trans_data, @@ -142,10 +142,10 @@ void XPUDimUniqueKernelImpl(const Context& dev_ctx, DDim x_trans_dims = x.dims(); x_trans_dims[0] = x.dims()[axis]; x_trans_dims[axis] = x.dims()[0]; - DDim x_trans_flat_dims = phi::flatten_to_2d(x_trans_dims, 1); + DDim x_trans_flat_dims = common::flatten_to_2d(x_trans_dims, 1); int64_t axis_len = x_trans_flat_dims[0]; int64_t slice_size = x_trans_flat_dims[1]; - auto x_trans_flat_dims_vec = vectorize(x_trans_flat_dims); + auto x_trans_flat_dims_vec = common::vectorize(x_trans_flat_dims); auto* sorted_axis_idx = RAII_GUARD.alloc_l3_or_gm(axis_len); auto* sort_in_tmp = RAII_GUARD.alloc_l3_or_gm(axis_len); @@ -284,7 +284,7 @@ void XPUDimUniqueKernelImpl(const Context& dev_ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather"); DDim out_trans_dims = x_trans_dims; out_trans_dims[0] = unique_len; - auto out_trans_dims_vec = vectorize(out_trans_dims); + auto out_trans_dims_vec = common::vectorize(out_trans_dims); if (axis != 0) { r = xpu::transpose(dev_ctx.x_context(), out_trans_data, diff --git a/paddle/phi/kernels/xpu/unstack_kernel.cc b/paddle/phi/kernels/xpu/unstack_kernel.cc index 1c9c7a797957a4..a498ed99c2460f 100644 --- a/paddle/phi/kernels/xpu/unstack_kernel.cc +++ b/paddle/phi/kernels/xpu/unstack_kernel.cc @@ -29,7 +29,7 @@ void UnStackKernel(const Context &dev_ctx, auto x_dims = x.dims(); if (axis < 0) axis += x_dims.size(); - auto x_shape = phi::vectorize(x_dims); + auto x_shape = common::vectorize(x_dims); std::vector dx_dims_list(outs.size(), 1); std::vector dx_lists; diff --git a/paddle/phi/kernels/xpu/warpctc_kernel.cc b/paddle/phi/kernels/xpu/warpctc_kernel.cc index aac1ee9093a4e6..7a5bbfe5cb2998 100644 --- a/paddle/phi/kernels/xpu/warpctc_kernel.cc +++ b/paddle/phi/kernels/xpu/warpctc_kernel.cc @@ -110,7 +110,7 @@ void 
WarpctcKernel(const Context& dev_ctx, DataTypeToString(labels_length_dtype))); warpctcgrad->Resize( - phi::make_ddim({max_sequence_length, num_sequences, sequence_width})); + common::make_ddim({max_sequence_length, num_sequences, sequence_width})); dev_ctx.template Alloc(warpctcgrad); T* warpctcgrad_data = warpctcgrad->data(); @@ -136,7 +136,7 @@ void WarpctcKernel(const Context& dev_ctx, 256 * 1024, sm_workspace + lm_workspace)); - loss->Resize(phi::make_ddim({num_sequences, 1})); + loss->Resize(common::make_ddim({num_sequences, 1})); dev_ctx.template Alloc(loss); T* loss_data = loss->data(); diff --git a/paddle/phi/kernels/xpu/where_grad_kernel.cc b/paddle/phi/kernels/xpu/where_grad_kernel.cc index 03cdc117ed0d9d..49a5a1b22685d7 100644 --- a/paddle/phi/kernels/xpu/where_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/where_grad_kernel.cc @@ -31,8 +31,8 @@ void WhereGradKernel(const Context& ctx, const auto* cond_data = condition.data(); auto* dout = out_grad.data(); - auto cond_shape = phi::vectorize(condition.dims()); - auto out_shape = phi::vectorize(out_grad.dims()); + auto cond_shape = common::vectorize(condition.dims()); + auto out_shape = common::vectorize(out_grad.dims()); // use [1] to replace [], because xpu not support [] if (cond_shape.size() == 0) { cond_shape = std::vector({1}); diff --git a/paddle/phi/kernels/xpu/where_kernel.cc b/paddle/phi/kernels/xpu/where_kernel.cc index 4c5a7fbf5cc094..1edfc693cff867 100644 --- a/paddle/phi/kernels/xpu/where_kernel.cc +++ b/paddle/phi/kernels/xpu/where_kernel.cc @@ -31,8 +31,8 @@ void WhereKernel(const Context& ctx, const XPUType* y_data = reinterpret_cast(y.data()); XPUType* out_data = reinterpret_cast(ctx.template Alloc(out)); - auto cond_dims = phi::vectorize(condition.dims()); - auto x_dims = phi::vectorize(x.dims()); + auto cond_dims = common::vectorize(condition.dims()); + auto x_dims = common::vectorize(x.dims()); // use [1] to replace [], because xpu not support [] if (cond_dims.size() == 0) { diff --git a/paddle/phi/kernels/xpu/xpu_api_wrapper.h b/paddle/phi/kernels/xpu/xpu_api_wrapper.h index b75eaa15893234..70ee326500e1ca 100644 --- a/paddle/phi/kernels/xpu/xpu_api_wrapper.h +++ b/paddle/phi/kernels/xpu/xpu_api_wrapper.h @@ -122,9 +122,9 @@ static void GetFCInfo(const phi::DDim& x_dims, bool trans_y, XpuFcInfo* info) { DDim new_x_dims = - (x_dims.size() > 1) ? x_dims : phi::make_ddim({1, x_dims[0]}); + (x_dims.size() > 1) ? x_dims : common::make_ddim({1, x_dims[0]}); DDim new_y_dims = - (y_dims.size() > 1) ? y_dims : phi::make_ddim({y_dims[0], 1}); + (y_dims.size() > 1) ? 
y_dims : common::make_ddim({y_dims[0], 1}); auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(new_x_dims, 0, trans_x); auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(new_y_dims, 0, trans_y); diff --git a/paddle/phi/tools/CMakeLists.txt b/paddle/phi/tools/CMakeLists.txt index 74a9bd66a78d05..e6ae73384180eb 100644 --- a/paddle/phi/tools/CMakeLists.txt +++ b/paddle/phi/tools/CMakeLists.txt @@ -6,7 +6,7 @@ if(WITH_GPU) endif() add_executable(print_phi_kernels print_phi_kernels.cc) -target_link_libraries(print_phi_kernels phi) +target_link_libraries(print_phi_kernels phi common) if(WIN32) target_link_libraries(print_phi_kernels shlwapi.lib) endif() diff --git a/paddle/pir/core/block.cc b/paddle/pir/core/block.cc index 9a42e927557990..73902960c95ab7 100644 --- a/paddle/pir/core/block.cc +++ b/paddle/pir/core/block.cc @@ -16,7 +16,7 @@ #include -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/operation.h" #include "paddle/pir/core/region.h" diff --git a/paddle/pir/core/block_argument.cc b/paddle/pir/core/block_argument.cc index a0da7fbc16b2ac..66a18964280d39 100644 --- a/paddle/pir/core/block_argument.cc +++ b/paddle/pir/core/block_argument.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/pir/core/block_argument.h" -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/value_impl.h" #define CHECK_NULL_IMPL(func_name) \ diff --git a/paddle/pir/core/block_operand.cc b/paddle/pir/core/block_operand.cc index 78dd9c0b5d14e6..2b435f74a29a94 100644 --- a/paddle/pir/core/block_operand.cc +++ b/paddle/pir/core/block_operand.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/pir/core/block_operand.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/block.h" #include "paddle/pir/core/block_operand_impl.h" -#include "paddle/pir/core/enforce.h" namespace pir { diff --git a/paddle/pir/core/builtin_attribute_storage.h b/paddle/pir/core/builtin_attribute_storage.h index 2ab13326d3ebc6..c35d17e2544e6f 100644 --- a/paddle/pir/core/builtin_attribute_storage.h +++ b/paddle/pir/core/builtin_attribute_storage.h @@ -18,9 +18,9 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/pir/core/attribute.h" #include "paddle/pir/core/attribute_base.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/type.h" #include "paddle/pir/core/utils.h" diff --git a/paddle/pir/core/builtin_op.cc b/paddle/pir/core/builtin_op.cc index c228e2565b9ed5..9f80b7a93a4193 100644 --- a/paddle/pir/core/builtin_op.cc +++ b/paddle/pir/core/builtin_op.cc @@ -13,7 +13,7 @@ // limitations under the License. 
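// A minimal sketch (not part of the patch) of the call-site pattern the phi
// kernel hunks above rely on: the DDim helpers keep their names and
// signatures and only move from the phi namespace to common, so each hunk is
// a mechanical rename. That paddle/common/ddim.h exports common::DDim,
// common::make_ddim, and common::vectorize with the old phi semantics is an
// assumption inferred from these call sites.

#include <cstdint>
#include <vector>

#include "paddle/common/ddim.h"

std::vector<int64_t> ExampleShape() {
  common::DDim dims = common::make_ddim({2, 3, 4});  // was: phi::make_ddim
  return common::vectorize(dims);                    // was: phi::vectorize
}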
#include "paddle/pir/core/builtin_op.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_type.h" diff --git a/paddle/pir/core/builtin_type.cc b/paddle/pir/core/builtin_type.cc index fb168a9a051cc4..a12255dc8adf97 100644 --- a/paddle/pir/core/builtin_type.cc +++ b/paddle/pir/core/builtin_type.cc @@ -23,9 +23,7 @@ const DenseTensorType::Dim& DenseTensorType::dims() const { return storage()->dims_; } -DenseTensorType::DataLayout DenseTensorType::data_layout() const { - return storage()->layout_; -} +DataLayout DenseTensorType::data_layout() const { return storage()->layout_; } const DenseTensorType::LoD& DenseTensorType::lod() const { return storage()->lod_; diff --git a/paddle/pir/core/builtin_type.h b/paddle/pir/core/builtin_type.h index d151f80d3e79c7..b1f21d6e9d418b 100644 --- a/paddle/pir/core/builtin_type.h +++ b/paddle/pir/core/builtin_type.h @@ -59,7 +59,6 @@ class DenseTensorType : public Type::TypeBase ShapedTypeInterface::GetDyShape() const { if (dy_shape_.size() == 0) { - auto ddim_vec = vectorize(impl_->get_shape(*this)); + auto ddim_vec = common::vectorize(impl_->get_shape(*this)); dy_shape_ = ddim_vec; std::replace(dy_shape_.begin(), dy_shape_.end(), diff --git a/paddle/pir/core/builtin_type_interfaces.h b/paddle/pir/core/builtin_type_interfaces.h index 6497a0146bd69c..34144e5a7785b8 100644 --- a/paddle/pir/core/builtin_type_interfaces.h +++ b/paddle/pir/core/builtin_type_interfaces.h @@ -17,16 +17,16 @@ #include #include -#include "paddle/phi/core/tensor_base.h" +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/cast_utils.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/type.h" namespace pir { class ShapedTypeInterface : public TypeInterfaceBase { public: - using DDim = phi::DDim; + using DDim = pir::DDim; using DataType = Type; struct Concept { /// Defined these methods with the interface. diff --git a/paddle/pir/core/builtin_type_storage.h b/paddle/pir/core/builtin_type_storage.h index 10063963df6332..d8361658f9e85b 100644 --- a/paddle/pir/core/builtin_type_storage.h +++ b/paddle/pir/core/builtin_type_storage.h @@ -14,8 +14,9 @@ #pragma once -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" +#include "paddle/common/dim.h" +#include "paddle/common/layout.h" #include "paddle/pir/core/type.h" #include "paddle/pir/core/type_base.h" #include "paddle/pir/core/utils.h" @@ -50,13 +51,13 @@ struct DenseTensorTypeStorage : public pir::TypeStorage { /// /// \brief Declare ParamKey according to parameter type. /// - using DataLayout = phi::DataLayout; - using Dim = phi::DDim; + using Dim = pir::DDim; + using DataLayout = pir::DataLayout; using LoD = std::vector>; - using ParamKey = std::tuple; + using ParamKey = std::tuple; DenseTensorTypeStorage(Type dtype, - const Dim& dims, + const pir::DDim& dims, DataLayout layout, const LoD& lod, size_t offset) @@ -88,7 +89,7 @@ struct DenseTensorTypeStorage : public pir::TypeStorage { pir::hash_combine(hash_value, std::hash()(std::get<0>(key))); // hash dims hash_value = - pir::hash_combine(hash_value, std::hash()(std::get<1>(key))); + pir::hash_combine(hash_value, std::hash()(std::get<1>(key))); // hash layout hash_value = pir::hash_combine( hash_value, @@ -120,7 +121,7 @@ struct DenseTensorTypeStorage : public pir::TypeStorage { /// layout, lod, offset. 
/// pir::Type dtype_; - Dim dims_; + pir::DDim dims_; DataLayout layout_; LoD lod_; size_t offset_; diff --git a/paddle/pir/core/dialect.h b/paddle/pir/core/dialect.h index 87332e184256aa..0cb6f9aae234e1 100644 --- a/paddle/pir/core/dialect.h +++ b/paddle/pir/core/dialect.h @@ -17,10 +17,10 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/pir/core/attribute.h" #include "paddle/pir/core/attribute_base.h" #include "paddle/pir/core/dialect_interface.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/ir_context.h" #include "paddle/pir/core/op_base.h" #include "paddle/pir/core/type_base.h" diff --git a/paddle/pir/core/enforce.h b/paddle/pir/core/enforce.h deleted file mode 100644 index e8624b8bbe4e13..00000000000000 --- a/paddle/pir/core/enforce.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/utils/string/printf.h" - -#if defined(_WIN32) -#define UNUSED -#define __builtin_expect(EXP, C) (EXP) -#else -#define UNUSED __attribute__((unused)) -#endif - -#if !defined(_WIN32) -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -#else -// there is no equivalent intrinsics in msvc. -#define UNLIKELY(condition) (condition) -#endif -template -inline bool is_error(const T& stat) { - return !stat; -} - -namespace pir { -class IrNotMetException : public std::exception { - public: - explicit IrNotMetException(const std::string& str) : err_str_(str) {} - - const char* what() const noexcept override { return err_str_.c_str(); } - - private: - std::string err_str_; -}; - -#define IR_THROW(...) \ - do { \ - try { \ - throw pir::IrNotMetException( \ - paddle::string::Sprintf("Error occured at: %s:%d :\n%s", \ - __FILE__, \ - __LINE__, \ - paddle::string::Sprintf(__VA_ARGS__))); \ - } catch (const std::exception& e) { \ - std::cout << e.what() << std::endl; \ - throw; \ - } \ - } while (0) - -#define IR_ENFORCE(COND, ...) 
\ - do { \ - bool __cond__(COND); \ - if (UNLIKELY(is_error(__cond__))) { \ - try { \ - throw pir::IrNotMetException( \ - paddle::string::Sprintf("Error occured at: %s:%d :\n%s", \ - __FILE__, \ - __LINE__, \ - paddle::string::Sprintf(__VA_ARGS__))); \ - } catch (const std::exception& e) { \ - std::cout << e.what() << std::endl; \ - throw; \ - } \ - } \ - } while (0) - -} // namespace pir diff --git a/paddle/pir/core/interface_support.h b/paddle/pir/core/interface_support.h index 083be35f7f1f92..f8fc83efa31720 100644 --- a/paddle/pir/core/interface_support.h +++ b/paddle/pir/core/interface_support.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/interface_value.h" namespace pir { diff --git a/paddle/pir/core/iterator.h b/paddle/pir/core/iterator.h index 54563d2fce80c1..4e87fa290f8cff 100644 --- a/paddle/pir/core/iterator.h +++ b/paddle/pir/core/iterator.h @@ -15,7 +15,7 @@ #pragma once #include #include -#include "paddle/pir/core/macros.h" +#include "paddle/common/macros.h" namespace pir { class Operation; diff --git a/paddle/pir/core/macros.h b/paddle/pir/core/macros.h deleted file mode 100644 index 25d6dd5a812abc..00000000000000 --- a/paddle/pir/core/macros.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -namespace pir { -// TODO(Aurelius84): We also has DISABLE_COPY_AND_ASSIGN in phi/core/maros.h, -// howere it's not recommended to use it in ir namspace. So we define this again -// here. - -// Disable the copy and assignment operator for a class. -#ifndef DISABLE_COPY_AND_ASSIGN -#define DISABLE_COPY_AND_ASSIGN(classname) \ - private: \ - classname(const classname&) = delete; \ - classname(classname&&) = delete; \ - classname& operator=(const classname&) = delete; \ - classname& operator=(classname&&) = delete -#endif - -} // namespace pir diff --git a/paddle/pir/core/op_base.h b/paddle/pir/core/op_base.h index 9a0edfd6714988..c7f82954844d79 100644 --- a/paddle/pir/core/op_base.h +++ b/paddle/pir/core/op_base.h @@ -15,7 +15,7 @@ #pragma once #include -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/interface_support.h" #include "paddle/pir/core/op_result.h" #include "paddle/pir/core/operation.h" diff --git a/paddle/pir/core/op_operand.cc b/paddle/pir/core/op_operand.cc index c728180f48fbfb..74e5dced1fc630 100644 --- a/paddle/pir/core/op_operand.cc +++ b/paddle/pir/core/op_operand.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
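// A usage sketch for the enforce migration in the surrounding hunks: call
// sites keep using IR_THROW/IR_ENFORCE unchanged and only the providing
// header moves. That paddle/common/enforce.h supplies the same printf-style
// macros shown in the deleted paddle/pir/core/enforce.h above is the
// assumption these include swaps rely on.

#include "paddle/common/enforce.h"

void CheckRank(int rank) {
  // Both macros format their message via paddle::string::Sprintf, as in the
  // deleted definitions above.
  IR_ENFORCE(rank >= 0, "expected a non-negative rank, but got %d", rank);
  if (rank > 9) {
    IR_THROW("rank %d is not supported", rank);
  }
}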
#include "paddle/pir/core/op_operand.h" -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/op_operand_impl.h" #define CHECK_NULL_IMPL(class_name, func_name) \ diff --git a/paddle/pir/core/op_result.cc b/paddle/pir/core/op_result.cc index 8249872593652f..30c6ec97d8fbae 100644 --- a/paddle/pir/core/op_result.cc +++ b/paddle/pir/core/op_result.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/pir/core/op_result.h" -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/op_result_impl.h" #define CHECK_OPRESULT_NULL_IMPL(func_name) \ diff --git a/paddle/pir/core/op_trait.cc b/paddle/pir/core/op_trait.cc index d7103fa31ce455..506af3177e671f 100644 --- a/paddle/pir/core/op_trait.cc +++ b/paddle/pir/core/op_trait.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/pir/core/op_trait.h" -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/type_util.h" namespace { diff --git a/paddle/pir/core/operation.cc b/paddle/pir/core/operation.cc index 0697195fc2f94a..4ce0cda102be7e 100644 --- a/paddle/pir/core/operation.cc +++ b/paddle/pir/core/operation.cc @@ -14,10 +14,10 @@ #include +#include "paddle/common/enforce.h" #include "paddle/pir/core/block.h" #include "paddle/pir/core/block_operand_impl.h" #include "paddle/pir/core/dialect.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/op_info.h" #include "paddle/pir/core/op_result_impl.h" #include "paddle/pir/core/operation.h" diff --git a/paddle/pir/core/operation.h b/paddle/pir/core/operation.h index 11943609e41634..0c3f213adab506 100644 --- a/paddle/pir/core/operation.h +++ b/paddle/pir/core/operation.h @@ -16,10 +16,10 @@ #include #include +#include "paddle/common/enforce.h" +#include "paddle/common/macros.h" #include "paddle/pir/core/block.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/iterator.h" -#include "paddle/pir/core/macros.h" #include "paddle/pir/core/op_info.h" #include "paddle/pir/core/operation_utils.h" #include "paddle/pir/core/type.h" diff --git a/paddle/pir/core/region.cc b/paddle/pir/core/region.cc index dfb3b45aef3e9b..911cb740fa6452 100644 --- a/paddle/pir/core/region.cc +++ b/paddle/pir/core/region.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/pir/core/region.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/block.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/operation.h" namespace pir { diff --git a/paddle/pir/core/storage_manager.cc b/paddle/pir/core/storage_manager.cc index 07cc4e07cce2c1..bcfdf34a231e83 100644 --- a/paddle/pir/core/storage_manager.cc +++ b/paddle/pir/core/storage_manager.cc @@ -17,7 +17,7 @@ #include #include -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" namespace pir { // This is a structure for creating, caching, and looking up Storage of diff --git a/paddle/pir/core/type_util.h b/paddle/pir/core/type_util.h index 5704ba2abea781..14f1c7022c88ce 100644 --- a/paddle/pir/core/type_util.h +++ b/paddle/pir/core/type_util.h @@ -31,8 +31,8 @@ Type GetElementTypeOrSelf(Type type); /// have the same size and each pair of the elements are equal or one of them is /// dynamic. 
/// -bool VerifyCompatibleShape(const phi::DDim& lhs_shape, - const phi::DDim& rhs_shape); +bool VerifyCompatibleShape(const pir::DDim& lhs_shape, + const pir::DDim& rhs_shape); /// /// \brief Returns true if the given two types have compatible shape. That diff --git a/paddle/pir/core/value.cc b/paddle/pir/core/value.cc index dec2aaecfb3441..8bdda56a5d75ee 100644 --- a/paddle/pir/core/value.cc +++ b/paddle/pir/core/value.cc @@ -16,7 +16,7 @@ #include -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/op_operand.h" #include "paddle/pir/core/op_result.h" #include "paddle/pir/core/operation.h" diff --git a/paddle/pir/dialect/shape/ir/shape_op.cc b/paddle/pir/dialect/shape/ir/shape_op.cc index bf4a85d0d648f0..4a494f3ca187ac 100644 --- a/paddle/pir/dialect/shape/ir/shape_op.cc +++ b/paddle/pir/dialect/shape/ir/shape_op.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/pir/dialect/shape/ir/shape_op.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/builtin_type.h" -#include "paddle/pir/core/enforce.h" namespace pir::shape { diff --git a/paddle/pir/pass/pass.h b/paddle/pir/pass/pass.h index cc5e4a1dcbd834..30c55d7d3c6c59 100644 --- a/paddle/pir/pass/pass.h +++ b/paddle/pir/pass/pass.h @@ -18,8 +18,8 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/pir/core/builtin_op.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/pass/analysis_manager.h" #include "paddle/pir/pattern_rewrite/pattern_rewrite_driver.h" diff --git a/paddle/pir/pass/pass_registry.h b/paddle/pir/pass/pass_registry.h index 88dbfa443ddc37..08d76133edaa55 100644 --- a/paddle/pir/pass/pass_registry.h +++ b/paddle/pir/pass/pass_registry.h @@ -18,8 +18,7 @@ #include #include -#include "paddle/pir/core/enforce.h" -#include "paddle/pir/core/macros.h" +#include "paddle/common/enforce.h" #include "paddle/pir/pass/pass.h" namespace pir { diff --git a/paddle/pir/pattern_rewrite/pattern_match.cc b/paddle/pir/pattern_rewrite/pattern_match.cc index 7155894a68ef47..2cc8e80e3d6dc2 100644 --- a/paddle/pir/pattern_rewrite/pattern_match.cc +++ b/paddle/pir/pattern_rewrite/pattern_match.cc @@ -17,7 +17,7 @@ #include #include -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/operation.h" namespace pir { diff --git a/paddle/pir/pattern_rewrite/pattern_match.h b/paddle/pir/pattern_rewrite/pattern_match.h index d247ff075615a6..a0c34d8f58f073 100644 --- a/paddle/pir/pattern_rewrite/pattern_match.h +++ b/paddle/pir/pattern_rewrite/pattern_match.h @@ -24,9 +24,9 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/pir/core/builder.h" #include "paddle/pir/core/dll_decl.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/ir_context.h" #include "paddle/pir/core/op_info.h" #include "paddle/pir/core/operation.h" diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 4c5f3049f23254..ecba5716b7b494 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -5,6 +5,7 @@ if(WITH_TESTING) device_context gtest phi + common init memory phi_utils diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 0bc36dd5578ea7..529ffe8ebb44c0 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -14,11 +14,11 @@ add_subdirectory(string) cc_test( array_ref_test SRCS array_ref_test.cc - DEPS gtest phi) 
+ DEPS gtest phi common) cc_test( small_vector_test SRCS small_vector_test.cc - DEPS gtest phi) + DEPS gtest phi common) cc_test( variant_test SRCS variant_test.cc @@ -32,5 +32,5 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_library( pybind_util SRCS pybind.cc - DEPS phi) + DEPS phi common) endif() diff --git a/paddle/utils/string/CMakeLists.txt b/paddle/utils/string/CMakeLists.txt index ddfc8f96b2ecdd..7bdc43629b10a1 100644 --- a/paddle/utils/string/CMakeLists.txt +++ b/paddle/utils/string/CMakeLists.txt @@ -1,15 +1,15 @@ cc_library( pretty_log SRCS pretty_log.cc - DEPS phi) + DEPS phi common) cc_library( string_helper SRCS string_helper.cc - DEPS phi) + DEPS phi common) cc_test( stringprintf_test SRCS printf_test.cc - DEPS phi) + DEPS phi common) cc_test(to_string_test SRCS to_string_test.cc) cc_test(split_test SRCS split_test.cc) cc_test( diff --git a/patches/eigen/TensorReductionGpu.h b/patches/eigen/TensorReductionGpu.h index 696078e54881af..4807aaa2c1be75 100644 --- a/patches/eigen/TensorReductionGpu.h +++ b/patches/eigen/TensorReductionGpu.h @@ -14,7 +14,7 @@ namespace Eigen { namespace internal { #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) -// Full reducers for GPU, don't vectorize for now +// Full reducers for GPU, don't common::vectorize for now // Reducer function that enables multiple gpu thread to safely accumulate at the same // output address. It basically reads the current value of the output variable, and diff --git a/python/setup.py.in b/python/setup.py.in index 25e1c2ca8df7cc..620893ab0d17b0 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -737,7 +737,7 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': if "@APPLE@" == "1": commands = ["install_name_tool -id '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/base/${FLUID_CORE_NAME}" + '.so'] commands.append("install_name_tool -add_rpath '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/base/${FLUID_CORE_NAME}" + '.so') - commands.append("install_name_tool -add_rpath '@loader_path' ${PADDLE_BINARY_DIR}/python/paddle/libs/${COMMON_NAME}") + commands.append("install_name_tool -add_rpath '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/libs/${COMMON_NAME}") if('${WITH_SHARED_PHI}' == 'ON'): # change rpath of phi.ext for loading 3rd party libb commands.append("install_name_tool -add_rpath '@loader_path' ${PADDLE_BINARY_DIR}/python/paddle/libs/${PHI_NAME}") @@ -780,6 +780,7 @@ def find_files(pattern, root, recursive=False): headers = ( # paddle level api headers (high level api, for both training and inference) list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) + + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/common')) + # paddle common headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api')) + # phi unify api header list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api/ext')) + # custom op api list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api/include')) + # phi api diff --git a/python/setup_cinn.py.in b/python/setup_cinn.py.in index 753a1d30cd7ad3..f7fb513ed218ea 100644 --- a/python/setup_cinn.py.in +++ b/python/setup_cinn.py.in @@ -141,6 +141,8 @@ if '${WITH_MKLDNN}' == 'ON': if '${CINN_ONLY}' == 'OFF': cinnlibs.append('${PHI_LIB}') + cinnlibs.append('${IR_LIB}') + cinnlibs.append('${COMMON_LIB}') if '${WITH_GPU}' == 'ON': cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh') diff --git a/setup.py b/setup.py index e0e52c27d5b639..791e9ecc0cf22e 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 
@@ # check python python_version = platform.python_version() version_detail = sys.version_info -version = str(version_detail[0]) + '.' + str(version_detail[1]) +version = str(version_detail[0]) + '.' + str(version_detail[1]) env_version = str(os.getenv("PY_VERSION")) if version_detail < (3, 7): @@ -57,14 +57,12 @@ f"we will attempt to use the python version you set to execute." ) cmd = 'which python' + env_version - res = subprocess.run(cmd, shell = True, stdout=subprocess.PIPE) + res = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE) if res.returncode == 0: os.environ["PYTHON_EXECUTABLE"] = res else: - raise RuntimeError( - "We can't find the version you set in your machine" - ) - + raise RuntimeError("We can't find the version you set in your machine") + # check cmake CMAKE = shutil.which('cmake3') or shutil.which('cmake') @@ -1181,7 +1179,7 @@ def get_package_data_and_package_dir(): + '.so' ) commands.append( - "install_name_tool -add_rpath '@loader_path' " + "install_name_tool -add_rpath '@loader_path/../libs/' " + env_dict.get("PADDLE_BINARY_DIR") + '/python/paddle/libs/' + env_dict.get("COMMON_NAME") @@ -1266,6 +1264,9 @@ def get_headers(): + list( # phi api find_files('*.h', paddle_source_dir + '/paddle/phi/common') ) + + list( # common api + find_files('*.h', paddle_source_dir + '/paddle/common') + ) # phi level api headers (low level api, for training only) + list( # phi extension header find_files('*.h', paddle_source_dir + '/paddle/phi') diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 00186357bfac5e..dd4ff65e332019 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -189,6 +189,7 @@ if(${len} GREATER_EQUAL 1) if(WITH_SHARED_IR) target_link_libraries(${test_name} $) endif() + target_link_libraries(${test_name} $) add_dependencies(${test_name} ${paddle_lib} paddle_gtest_main_new) if(WITH_GPU) target_link_libraries(${test_name} ${CUDA_CUDART_LIBRARY} @@ -200,7 +201,7 @@ if(${len} GREATER_EQUAL 1) if(APPLE) target_link_libraries( ${test_name} - "-Wl,-rpath,$ -Wl,-rpath,$ -Wl,-rpath,$" + "-Wl,-rpath,$ -Wl,-rpath,$ -Wl,-rpath,$ -Wl,-rpath,$" ) endif() if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) @@ -239,9 +240,6 @@ endif() if(TARGET layer_test) add_dependencies(layer_test jit_download_program) - add_dependencies(layer_test_new jit_download_program) - set_tests_properties(layer_test_new PROPERTIES ENVIRONMENT - "FLAGS_jit_engine_type=New") endif() if(TEST buddy_allocator_test) diff --git a/test/cpp/auto_parallel/CMakeLists.txt b/test/cpp/auto_parallel/CMakeLists.txt index e041c746624443..f38cf32f350592 100644 --- a/test/cpp/auto_parallel/CMakeLists.txt +++ b/test/cpp/auto_parallel/CMakeLists.txt @@ -13,7 +13,8 @@ if(WITH_DISTRIBUTE) cc_test( dist_tensor_test SRCS dist_tensor_test.cc - DEPS phi) + DEPS phi common) + paddle_test( spmd_rule_test SRCS @@ -21,7 +22,8 @@ if(WITH_DISTRIBUTE) DEPS spmd_rule_test_util spmd_rules - phi) + phi + common) paddle_test( softmax_grad_spmd_rule_test SRCS @@ -32,4 +34,4 @@ if(WITH_DISTRIBUTE) phi) endif() -cc_test_old(dist_mapper_test SRCS dist_mapper_test.cc DEPS phi) +cc_test_old(dist_mapper_test SRCS dist_mapper_test.cc DEPS phi common) diff --git a/test/cpp/auto_parallel/dist_tensor_test.cc b/test/cpp/auto_parallel/dist_tensor_test.cc index a94cfd37d6cc24..c1d6851b86ca2d 100644 --- a/test/cpp/auto_parallel/dist_tensor_test.cc +++ b/test/cpp/auto_parallel/dist_tensor_test.cc @@ -34,7 +34,7 @@ TEST(dist_tensor, constructor) { DDim dims({3, 4}); DenseTensorMeta meta(dtype, dims); - auto dist_attr = 
TensorDistAttr(phi::vectorize(dims)); + auto dist_attr = TensorDistAttr(common::vectorize(dims)); std::vector mesh_shape = {1}; std::vector process_ids = {0}; diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc index 014672f91add1e..b4254e2d0912e6 100644 --- a/test/cpp/auto_parallel/spmd_rule_test.cc +++ b/test/cpp/auto_parallel/spmd_rule_test.cc @@ -41,8 +41,8 @@ TEST(MatmulSPMDRule, Ctor) { size_t input_size = 2; size_t output_size = 1; - phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr); - phi::distributed::DistMetaTensor y(phi::make_ddim(y_shape), y_dist_attr); + phi::distributed::DistMetaTensor x(common::make_ddim(x_shape), x_dist_attr); + phi::distributed::DistMetaTensor y(common::make_ddim(y_shape), y_dist_attr); auto matmul_spmd_rule = phi::distributed::SpmdRuleFactory::Instance().GetSpmdRule("matmul"); @@ -63,8 +63,8 @@ TEST(MatmulSPMDRule, Ctor) { // mk[-1,-1],kn[-1,0] --> mk[-1,-1],kn[-1,0] = nm[-1,0] partial[] x_dist_attr.set_dims_mapping({-1, -1}); y_dist_attr.set_dims_mapping({-1, 0}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); ctx = phi::distributed::InferSpmdContext( {x, y}, {/*trans_x=*/false, /*trans_x=*/false}); infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); @@ -76,8 +76,8 @@ TEST(MatmulSPMDRule, Ctor) { // mk[1, 0],kn[-1,-1] --> mk[1, 0],kn[0, -1] = nm[1, -1] partial[0]: done x_dist_attr.set_dims_mapping({1, 0}); y_dist_attr.set_dims_mapping({-1, -1}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); ctx = phi::distributed::InferSpmdContext( {x, y}, {/*trans_x=*/false, /*trans_x=*/false}); infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); @@ -91,8 +91,8 @@ TEST(MatmulSPMDRule, Ctor) { // mk[-1,-1],kn[1,0] --> mk[-1, 1],kn[1, 0] = nm[-1, 0] partial[1]: done x_dist_attr.set_dims_mapping({-1, -1}); y_dist_attr.set_dims_mapping({1, 0}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); ctx = phi::distributed::InferSpmdContext( {x, y}, {/*trans_x=*/false, /*trans_x=*/false}); infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); @@ -108,8 +108,8 @@ TEST(MatmulSPMDRule, Ctor) { x_shape = {512, 48, 64, 32}; x_dist_attr.set_dims_mapping({0, 1, -1, -1}); y_dist_attr.set_dims_mapping({-1, -1}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); ctx = phi::distributed::InferSpmdContext( {x, y}, {/*trans_x=*/false, /*trans_x=*/false}); infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); @@ -123,8 +123,8 @@ TEST(MatmulSPMDRule, Ctor) 
{ // -1, -1, -1] partial[0]: done x_dist_attr.set_dims_mapping({1, -1, -1, 0}); y_dist_attr.set_dims_mapping({-1, -1}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); ctx = phi::distributed::InferSpmdContext( {x, y}, {/*trans_x=*/false, /*trans_x=*/false}); infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); @@ -139,8 +139,8 @@ TEST(MatmulSPMDRule, Ctor) { // abcmn[1, -1, 0, -1] partial[]: done x_dist_attr.set_dims_mapping({1, -1, -1, 0}); y_dist_attr.set_dims_mapping({-1, -1}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); ctx = phi::distributed::InferSpmdContext( {x, y}, {/*trans_x=*/true, /*trans_x=*/false}); infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); @@ -156,8 +156,8 @@ TEST(MatmulSPMDRule, Ctor) { // abcmn[-1, -1, -1, 1] partial[0]: done x_dist_attr.set_dims_mapping({-1, -1, -1, -1}); y_dist_attr.set_dims_mapping({1, 0}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); ctx = phi::distributed::InferSpmdContext( {x, y}, {/*trans_x=*/false, /*trans_x=*/true}); infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); @@ -174,8 +174,8 @@ TEST(MatmulSPMDRule, Ctor) { // 0, -1],kn[-1, 0] = abcmn[-1, -1, 1, -1] partial[0]: done x_dist_attr.set_dims_mapping({-1, -1, 0, 1}); y_dist_attr.set_dims_mapping({1, 0}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); ctx = phi::distributed::InferSpmdContext( {x, y}, {/*trans_x=*/true, /*trans_x=*/true}); infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); @@ -195,8 +195,8 @@ TEST(MatmulSPMDRule, Ctor) { // abcmn[-1, -1, -1, 1] partial[0]: done x_dist_attr.set_dims_mapping({-1, -1, 1, 0}); y_dist_attr.set_dims_mapping({1, 0}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); ctx = phi::distributed::InferSpmdContext( {x, y}, {/*trans_x=*/true, /*trans_x=*/true}); EXPECT_ANY_THROW(infered_dist_attrs = matmul_spmd_rule.InferForward(ctx)); @@ -207,8 +207,8 @@ TEST(MatmulSPMDRule, Ctor) { // abcmn[-1, -1, 1, -1] partial[0]: x_dist_attr.set_dims_mapping({-1, -1, 0, 1}); y_dist_attr.set_dims_mapping({1, 0}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + x = 
phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); ctx = phi::distributed::InferSpmdContext( {x, y}, {/*trans_x=*/true, /*trans_x=*/true}); infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); @@ -269,10 +269,10 @@ TEST(LayerNormSPMDRule, Ctor) { x_dist_attr.set_dims_mapping({1, -1, -1}); scale_dist_attr.set_dims_mapping({-1}); bias_dist_attr.set_dims_mapping({-1}); - phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr); - phi::distributed::DistMetaTensor scale(phi::make_ddim(scale_shape), + phi::distributed::DistMetaTensor x(common::make_ddim(x_shape), x_dist_attr); + phi::distributed::DistMetaTensor scale(common::make_ddim(scale_shape), scale_dist_attr); - phi::distributed::DistMetaTensor bias(phi::make_ddim(bias_shape), + phi::distributed::DistMetaTensor bias(common::make_ddim(bias_shape), bias_dist_attr); phi::distributed::InferSpmdContext ctx({x, scale, bias}, {epsilon, begin_norm_axis}); @@ -296,10 +296,10 @@ TEST(LayerNormSPMDRule, Ctor) { x_dist_attr.set_dims_mapping({1, 0, -1}); scale_dist_attr.set_dims_mapping({0}); bias_dist_attr.set_dims_mapping({0}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - scale = phi::distributed::DistMetaTensor(phi::make_ddim(scale_shape), + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + scale = phi::distributed::DistMetaTensor(common::make_ddim(scale_shape), scale_dist_attr); - bias = phi::distributed::DistMetaTensor(phi::make_ddim(bias_shape), + bias = phi::distributed::DistMetaTensor(common::make_ddim(bias_shape), bias_dist_attr); ctx = phi::distributed::InferSpmdContext({x, scale, bias}, {epsilon, begin_norm_axis}); @@ -319,10 +319,10 @@ TEST(LayerNormSPMDRule, Ctor) { x_dist_attr.set_dims_mapping({0, -1, -1}); scale_dist_attr.set_dims_mapping({-1}); bias_dist_attr.set_dims_mapping({1}); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - scale = phi::distributed::DistMetaTensor(phi::make_ddim(scale_shape), + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + scale = phi::distributed::DistMetaTensor(common::make_ddim(scale_shape), scale_dist_attr); - bias = phi::distributed::DistMetaTensor(phi::make_ddim(bias_shape), + bias = phi::distributed::DistMetaTensor(common::make_ddim(bias_shape), bias_dist_attr); ctx = phi::distributed::InferSpmdContext({x, scale, bias}, {epsilon, begin_norm_axis}); @@ -366,9 +366,9 @@ TEST(MatmulSPMDRuleInferBackward, Ctor) { out_dist_attr.set_dynamic_dims(std::vector({false, false})); out_dist_attr.set_partial_status(std::vector({0})); - phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr); - phi::distributed::DistMetaTensor y(phi::make_ddim(y_shape), y_dist_attr); - phi::distributed::DistMetaTensor out(phi::make_ddim(out_shape), + phi::distributed::DistMetaTensor x(common::make_ddim(x_shape), x_dist_attr); + phi::distributed::DistMetaTensor y(common::make_ddim(y_shape), y_dist_attr); + phi::distributed::DistMetaTensor out(common::make_ddim(out_shape), out_dist_attr); auto matmul_spmd_rule = @@ -427,11 +427,11 @@ TEST(ReplicatedSPMDRule, Ctor) { out2_dist_attr.set_dims_mapping(std::vector({-1, 1, -1})); out2_dist_attr.set_dynamic_dims(std::vector({false, false})); - phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr); - phi::distributed::DistMetaTensor y(phi::make_ddim(y_shape), y_dist_attr); - phi::distributed::DistMetaTensor 
out1(phi::make_ddim(out1_shape), + phi::distributed::DistMetaTensor x(common::make_ddim(x_shape), x_dist_attr); + phi::distributed::DistMetaTensor y(common::make_ddim(y_shape), y_dist_attr); + phi::distributed::DistMetaTensor out1(common::make_ddim(out1_shape), out1_dist_attr); - phi::distributed::DistMetaTensor out2(phi::make_ddim(out2_shape), + phi::distributed::DistMetaTensor out2(common::make_ddim(out2_shape), out2_dist_attr); // 2 inputs 2 outputs @@ -539,11 +539,11 @@ TEST(DefaultDataParallelSPMDRule, Ctor) { out2_dist_attr.set_dims_mapping(std::vector({-1, 1, -1})); out2_dist_attr.set_dynamic_dims(std::vector({false, false})); - phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr); - phi::distributed::DistMetaTensor y(phi::make_ddim(y_shape), y_dist_attr); - phi::distributed::DistMetaTensor out1(phi::make_ddim(out1_shape), + phi::distributed::DistMetaTensor x(common::make_ddim(x_shape), x_dist_attr); + phi::distributed::DistMetaTensor y(common::make_ddim(y_shape), y_dist_attr); + phi::distributed::DistMetaTensor out1(common::make_ddim(out1_shape), out1_dist_attr); - phi::distributed::DistMetaTensor out2(phi::make_ddim(out2_shape), + phi::distributed::DistMetaTensor out2(common::make_ddim(out2_shape), out2_dist_attr); // 2 inputs 2 outputs, batch axis sharding is propagatd while other axes are @@ -603,9 +603,9 @@ TEST(DefaultDataParallelSPMDRule, Ctor) { x_dist_attr.set_dims_mapping(std::vector({0, -1, -1, -1})); y_dist_attr.set_dims_mapping(std::vector({-1, -1})); out1_dist_attr.set_dims_mapping(std::vector({1, -1, -1, -1})); - x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); - y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); - out1 = phi::distributed::DistMetaTensor(phi::make_ddim(out1_shape), + x = phi::distributed::DistMetaTensor(common::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(common::make_ddim(y_shape), y_dist_attr); + out1 = phi::distributed::DistMetaTensor(common::make_ddim(out1_shape), out1_dist_attr); EXPECT_ANY_THROW(infered_dist_attrs_st = @@ -622,9 +622,9 @@ TEST(DefaultDataParallelSPMDRule, Ctor) { // call in vector arguments format out1_dist_attr.set_dims_mapping(std::vector({-1, 0, 1, -1})); out2_dist_attr.set_dims_mapping(std::vector({0, 1, -1})); - out1 = phi::distributed::DistMetaTensor(phi::make_ddim(out1_shape), + out1 = phi::distributed::DistMetaTensor(common::make_ddim(out1_shape), out1_dist_attr); - out2 = phi::distributed::DistMetaTensor(phi::make_ddim(out2_shape), + out2 = phi::distributed::DistMetaTensor(common::make_ddim(out2_shape), out2_dist_attr); infered_dist_attrs_st = phi::distributed::DefaultDataParallelInferSpmdReverse( @@ -667,8 +667,8 @@ TEST(ConcatRule, Ctor) { t_dist_attr.set_process_mesh(process_mesh); t_dist_attr.set_dims_mapping(dim_mappings[i]); t_dist_attr.set_dynamic_dims({false, false, false}); - auto input = phi::distributed::DistMetaTensor(phi::make_ddim(shapes[i]), - t_dist_attr); + auto input = phi::distributed::DistMetaTensor( + common::make_ddim(shapes[i]), t_dist_attr); inputs.push_back(input); } return inputs; @@ -695,7 +695,8 @@ TEST(ConcatRule, Ctor) { auto build_output = [&](const TensorDistAttr& t_dist_attr, const std::vector& shape) { - return phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + return phi::distributed::DistMetaTensor(common::make_ddim(shape), + t_dist_attr); }; auto& output_dist_attr = @@ -769,8 +770,8 @@ TEST(StackRule, Ctor) { t_dist_attr.set_process_mesh(process_mesh); 
t_dist_attr.set_dims_mapping(dim_mappings[i]); t_dist_attr.set_dynamic_dims({false, false, false}); - auto input = phi::distributed::DistMetaTensor(phi::make_ddim(input_shape), - t_dist_attr); + auto input = phi::distributed::DistMetaTensor( + common::make_ddim(input_shape), t_dist_attr); inputs.push_back(input); } return inputs; @@ -787,7 +788,7 @@ TEST(StackRule, Ctor) { input_shape.end(), std::back_inserter(output_shape), [](int64_t x) { return x; }); - return phi::distributed::DistMetaTensor(phi::make_ddim(output_shape), + return phi::distributed::DistMetaTensor(common::make_ddim(output_shape), t_dist_attr); }; @@ -873,8 +874,8 @@ TEST(WhereRule, Ctor) { t_dist_attr.set_process_mesh(process_mesh); t_dist_attr.set_dims_mapping(dim_mappings[i]); t_dist_attr.set_dynamic_dims({false, false, false}); - auto input = phi::distributed::DistMetaTensor(phi::make_ddim(shapes[i]), - t_dist_attr); + auto input = phi::distributed::DistMetaTensor( + common::make_ddim(shapes[i]), t_dist_attr); inputs.push_back(input); } return inputs; @@ -909,8 +910,8 @@ TEST(ReduceMaxRule, Ctor) { t_dist_attr.set_process_mesh(process_mesh); t_dist_attr.set_dims_mapping({-1, 0, -1}); t_dist_attr.set_dynamic_dims({false, false, false}); - phi::distributed::DistMetaTensor x = - phi::distributed::DistMetaTensor(phi::make_ddim({4, 6, 8}), t_dist_attr); + phi::distributed::DistMetaTensor x = phi::distributed::DistMetaTensor( + common::make_ddim({4, 6, 8}), t_dist_attr); IntArray axis = {1}; bool keep_dim = false; phi::distributed::SpmdInfo forward_info = @@ -919,7 +920,7 @@ TEST(ReduceMaxRule, Ctor) { check_partial_dims(forward_info.second[0], {0}); // test backward phi::distributed::DistMetaTensor out = phi::distributed::DistMetaTensor( - phi::make_ddim({4, 8}), + common::make_ddim({4, 8}), PADDLE_GET_CONST(TensorDistAttr, forward_info.second[0])); phi::distributed::DistMetaTensor out_grad = out; phi::distributed::SpmdInfo backward_info = @@ -965,7 +966,7 @@ TEST(Numel, Ctor) { t_dist_attr.set_dims_mapping(dims_mapping); t_dist_attr.set_dynamic_dims({false, false, false}); auto input = - phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); auto infered_dist_attrs = phi::distributed::NumelInferSpmd(input); EXPECT_EQ(infered_dist_attrs.first.size(), static_cast(1)); EXPECT_EQ(infered_dist_attrs.second.size(), static_cast(1)); @@ -988,7 +989,7 @@ TEST(Triu, Ctor) { t_dist_attr.set_dims_mapping(dims_mapping); t_dist_attr.set_dynamic_dims({false, false, false}); auto input = - phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); auto infered_dist_attrs = phi::distributed::TriuGradInferSpmd(input, 0); EXPECT_EQ(infered_dist_attrs.first.size(), static_cast(1)); EXPECT_EQ(infered_dist_attrs.second.size(), static_cast(1)); @@ -1013,7 +1014,7 @@ TEST(LayerNorm, Ctor) { t_dist_attr.set_dims_mapping(dim_mapping); t_dist_attr.set_dynamic_dims({false, false, false}); auto input = - phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); return input; }; // test 1 @@ -1076,7 +1077,7 @@ TEST(FlashAtt, Ctor) { t_dist_attr.set_dims_mapping(dim_mapping); t_dist_attr.set_dynamic_dims(std::vector(shape.size(), false)); auto input = - phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + phi::distributed::DistMetaTensor(common::make_ddim(shape), 
t_dist_attr); return input; }; @@ -1164,7 +1165,7 @@ TEST(Transpose, Ctor) { t_dist_attr.set_dims_mapping(dims_mapping); t_dist_attr.set_dynamic_dims({false, false, false}); phi::distributed::DistMetaTensor x = - phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); std::vector<int> perm = {1, 2, -3}; // test forward phi::distributed::SpmdInfo forward_spmd_info = @@ -1176,7 +1177,7 @@ TEST(Transpose, Ctor) { check_partial_dims(forward_spmd_info.second[0], {}); // test backward phi::distributed::DistMetaTensor out_grad = phi::distributed::DistMetaTensor( - phi::make_ddim({8, 10, 6}), + common::make_ddim({8, 10, 6}), PADDLE_GET_CONST(TensorDistAttr, forward_spmd_info.second[0])); phi::distributed::SpmdInfo backward_spmd_info = TransposeGradInferSpmd(out_grad, perm); @@ -1200,7 +1201,7 @@ TEST(Reshape, Ctor) { t_dist_attr.set_dims_mapping(dim_mapping); t_dist_attr.set_dynamic_dims(std::vector<bool>(shape.size(), false)); auto input = - phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); return input; }; @@ -1246,28 +1247,32 @@ TEST(ElementwiseUnaryLike, Ctor) { // cast auto input = - phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); auto infered_dist_attrs = phi::distributed::CastInferSpmd(input, phi::DataType::FLOAT32); check_element_unary_like(infered_dist_attrs); // full like - input = phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + input = + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); infered_dist_attrs = phi::distributed::FullLikeInferSpmd(input, 1.0, phi::DataType::FLOAT32); check_element_unary_like(infered_dist_attrs); // pow - input = phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + input = + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); infered_dist_attrs = phi::distributed::PowInferSpmd(input, 2); check_element_unary_like(infered_dist_attrs); // pow backward - input = phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + input = + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); infered_dist_attrs = phi::distributed::PowGradInferSpmd(input, input, 2); // scale - input = phi::distributed::DistMetaTensor(phi::make_ddim(shape), t_dist_attr); + input = + phi::distributed::DistMetaTensor(common::make_ddim(shape), t_dist_attr); infered_dist_attrs = phi::distributed::ScaleInferSpmd(input, 1.0, 1.0, false); check_element_unary_like(infered_dist_attrs); } diff --git a/test/cpp/eager/CMakeLists.txt b/test/cpp/eager/CMakeLists.txt index b9729743a1c426..58c3547b9ef235 100644 --- a/test/cpp/eager/CMakeLists.txt +++ b/test/cpp/eager/CMakeLists.txt @@ -1,5 +1,6 @@ set(eager_deps phi + common hook_utils utils global_utils diff --git a/test/cpp/eager/data_structure_tests/accumulation_node_test.cc b/test/cpp/eager/data_structure_tests/accumulation_node_test.cc index 4bad555a439088..c1469d6e61a741 100644 --- a/test/cpp/eager/data_structure_tests/accumulation_node_test.cc +++ b/test/cpp/eager/data_structure_tests/accumulation_node_test.cc @@ -32,11 +32,11 @@ using namespace egr; // NOLINT TEST(AccumulationNode, SelectedRowsAddToTensor) { // Construct Eager Tensor phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + 
phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 1})); std::vector<int64_t> rows = {0}; std::shared_ptr<phi::SelectedRows> sr0 = std::make_shared<phi::SelectedRows>(rows, 1); - sr0->mutable_value()->Resize(phi::make_ddim({1, 1})); + sr0->mutable_value()->Resize(common::make_ddim({1, 1})); sr0->mutable_value()->mutable_data<float>(paddle::platform::CPUPlace())[0] = static_cast<float>(10.0f); paddle::Tensor et0 = paddle::Tensor(sr0); @@ -59,7 +59,7 @@ TEST(AccumulationNode, SelectedRowsAddToTensor) { // Initialize Grad Tensor std::shared_ptr<phi::SelectedRows> grad_dt = std::make_shared<phi::SelectedRows>(rows, 1); - grad_dt->mutable_value()->Resize(phi::make_ddim({1, 1})); + grad_dt->mutable_value()->Resize(common::make_ddim({1, 1})); grad_dt->mutable_value()->mutable_data<float>( paddle::platform::CPUPlace())[0] = static_cast<float>(0.0f); grad_meta->MutableGrad()->set_impl(grad_dt); @@ -97,17 +97,17 @@ TEST(AccumulationNode, SelectedRowsAddToTensor) { TEST(AccumulationNode, SelectedRowsMerge) { // Construct Eager Tensor phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 1})); std::vector<int64_t> rows = {0}; std::shared_ptr<phi::SelectedRows> sr0 = std::make_shared<phi::SelectedRows>(rows, 1); - sr0->mutable_value()->Resize(phi::make_ddim({1, 1})); + sr0->mutable_value()->Resize(common::make_ddim({1, 1})); sr0->mutable_value()->mutable_data<float>(paddle::platform::CPUPlace())[0] = static_cast<float>(10.0f); paddle::Tensor et0 = paddle::Tensor(sr0); std::shared_ptr<phi::SelectedRows> sr1 = std::make_shared<phi::SelectedRows>(rows, 1); - sr1->mutable_value()->Resize(phi::make_ddim({1, 1})); + sr1->mutable_value()->Resize(common::make_ddim({1, 1})); sr1->mutable_value()->mutable_data<float>(paddle::platform::CPUPlace())[0] = static_cast<float>(20.0f); paddle::Tensor et1 = paddle::Tensor(sr1); @@ -122,7 +122,7 @@ TEST(AccumulationNode, SelectedRowsMerge) { // Initialize Grad Tensor std::shared_ptr<phi::SelectedRows> grad_dt = std::make_shared<phi::SelectedRows>(rows, 1); - grad_dt->mutable_value()->Resize(phi::make_ddim({1, 1})); + grad_dt->mutable_value()->Resize(common::make_ddim({1, 1})); grad_dt->mutable_value()->mutable_data<float>( paddle::platform::CPUPlace())[0] = static_cast<float>(0.0f); grad_meta->MutableGrad()->set_impl(grad_dt); @@ -162,17 +162,17 @@ TEST(AccumulationNode, SelectedRowsMerge) { TEST(AccumulationNode, SelectedRowsAddTensor) { // Construct Eager Tensor phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 1})); std::vector<int64_t> rows = {0}; std::shared_ptr<phi::SelectedRows> sr0 = std::make_shared<phi::SelectedRows>(rows, 1); - sr0->mutable_value()->Resize(phi::make_ddim({1, 1})); + sr0->mutable_value()->Resize(common::make_ddim({1, 1})); sr0->mutable_value()->mutable_data<float>(paddle::platform::CPUPlace())[0] = static_cast<float>(10.0f); paddle::Tensor et0 = paddle::Tensor(sr0); std::shared_ptr<phi::SelectedRows> sr1 = std::make_shared<phi::SelectedRows>(rows, 1); - sr1->mutable_value()->Resize(phi::make_ddim({1, 1})); + sr1->mutable_value()->Resize(common::make_ddim({1, 1})); sr1->mutable_value()->mutable_data<float>(paddle::platform::CPUPlace())[0] = static_cast<float>(20.0f); paddle::Tensor et1 = paddle::Tensor(sr1); @@ -229,7 +229,7 @@ TEST(AccumulationNode, SelectedRowsAddTensor) { TEST(AccumulationNode, Tensor) { // Construct Eager Tensor phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT16, phi::make_ddim({1, 1})); + phi::DenseTensorMeta(phi::DataType::FLOAT16, common::make_ddim({1, 1})); std::shared_ptr<phi::DenseTensor> dt0 = std::make_shared<phi::DenseTensor>( std::make_unique<paddle::experimental::DefaultAllocator>( paddle::platform::CPUPlace()) diff --git a/test/cpp/eager/data_structure_tests/autograd_meta_test.cc 
b/test/cpp/eager/data_structure_tests/autograd_meta_test.cc index 651e3b63f07ac6..41eda2ec8080a4 100644 --- a/test/cpp/eager/data_structure_tests/autograd_meta_test.cc +++ b/test/cpp/eager/data_structure_tests/autograd_meta_test.cc @@ -40,7 +40,7 @@ TEST(AutogradMeta, MemberFunction) { CHECK(tmp_auto->Grad().defined() == false); auto* grad_t = tmp_auto->MutableGrad(); phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 2})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) diff --git a/test/cpp/eager/data_structure_tests/eager_tensor_test.cc b/test/cpp/eager/data_structure_tests/eager_tensor_test.cc index b2e3d8b5e7bd23..2ffdf033cf1cd1 100644 --- a/test/cpp/eager/data_structure_tests/eager_tensor_test.cc +++ b/test/cpp/eager/data_structure_tests/eager_tensor_test.cc @@ -16,9 +16,9 @@ #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/common/layout.h" #include "paddle/fluid/imperative/var_helper.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" namespace eager_test { @@ -37,7 +37,7 @@ TEST(Tensor, Constructor) { CHECK_EQ(et2.name(), "et2"); phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 2})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) @@ -67,7 +67,7 @@ TEST(Tensor, Constructor) { TEST(Tensor, MemberFunction) { paddle::Tensor et3; phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 2})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) @@ -87,7 +87,7 @@ TEST(Tensor, MemberFunction) { CHECK_EQ(et3.is_cpu(), true); CHECK_EQ(et3.is_gpu(), false); CHECK_EQ(et3.numel(), 2); - auto expected_dim = phi::make_ddim({1, 2}); + auto expected_dim = common::make_ddim({1, 2}); CHECK_EQ(et3.dims(), expected_dim); CHECK_EQ(et3.type(), phi::DataType::FLOAT32); CHECK_EQ(et3.layout(), phi::DataLayout::NCHW); @@ -121,7 +121,7 @@ TEST(Tensor, MemberFunction) { TEST(EagerVariable, Constructor) { paddle::Tensor t3; phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 2})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) @@ -159,7 +159,7 @@ TEST(EagerVariable, Constructor) { paddle::Tensor t7(std::make_shared(rows, 2)); std::dynamic_pointer_cast(t7.impl()) ->mutable_value() - ->Resize(phi::make_ddim(dims)); + ->Resize(common::make_ddim(dims)); auto* dt7_tmp_ptr = std::dynamic_pointer_cast(t7.impl()) ->mutable_value() ->mutable_data(paddle::platform::CPUPlace()); @@ -202,9 +202,10 @@ TEST(EagerVariable, Constructor) { TEST(EagerVariable, DataLayout) { paddle::Tensor tensor; - phi::DenseTensorMeta meta = phi::DenseTensorMeta(phi::DataType::FLOAT32, - phi::make_ddim({1, 1, 1, 1}), - phi::DataLayout::UNDEFINED); + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim({1, 1, 1, 1}), + phi::DataLayout::UNDEFINED); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) diff --git 
a/test/cpp/eager/data_structure_tests/grad_node_info_test.cc b/test/cpp/eager/data_structure_tests/grad_node_info_test.cc index 0948e6f72aa0b0..dc7027eac030ec 100644 --- a/test/cpp/eager/data_structure_tests/grad_node_info_test.cc +++ b/test/cpp/eager/data_structure_tests/grad_node_info_test.cc @@ -37,7 +37,7 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { paddle::small_vector, egr::kSlotSmallVectorSize> grads; phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) @@ -87,7 +87,7 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { auto gradient_hook = [](const paddle::Tensor& et) -> paddle::Tensor { paddle::Tensor res; phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) @@ -125,7 +125,7 @@ TEST(GradNodeInfo, GradNodeBase) { TEST(GradNodeInfo, Edge) { phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) diff --git a/test/cpp/eager/data_structure_tests/grad_node_test.h b/test/cpp/eager/data_structure_tests/grad_node_test.h index c1125f0774a517..8ead02e88f6baa 100644 --- a/test/cpp/eager/data_structure_tests/grad_node_test.h +++ b/test/cpp/eager/data_structure_tests/grad_node_test.h @@ -38,7 +38,7 @@ class GradTestNode : public egr::GradNodeBase { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) diff --git a/test/cpp/eager/data_structure_tests/grad_tensor_holder_test.cc b/test/cpp/eager/data_structure_tests/grad_tensor_holder_test.cc index b9e5b23a04e0b0..8476eb132dc4cc 100644 --- a/test/cpp/eager/data_structure_tests/grad_tensor_holder_test.cc +++ b/test/cpp/eager/data_structure_tests/grad_tensor_holder_test.cc @@ -37,7 +37,7 @@ TEST(GradTensorHolder, Constructor) { // Construct Eager Tensor phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({2, 2})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({2, 2})); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) @@ -55,7 +55,7 @@ TEST(GradTensorHolder, Constructor) { TEST(GradTensorHolder, Interfaces) { // Construct Eager Tensor phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 1})); std::shared_ptr dt0 = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) @@ -117,7 +117,8 @@ TEST(GradTensorHolder, SelectedRowsMergeAdd) { auto sr2 = std::make_shared(rows, table_size); // initialize a sparse table 1 - sr1->mutable_value()->Resize(phi::make_ddim({table_size, embedding_width})); + sr1->mutable_value()->Resize( + common::make_ddim({table_size, embedding_width})); auto* data_sr1 = sr1->mutable_value()->mutable_data(cpu); for 
(int64_t i = 0; i < table_size; ++i) { for (int64_t j = 0; j < embedding_width; ++j) { @@ -126,7 +127,8 @@ TEST(GradTensorHolder, SelectedRowsMergeAdd) { } // initialize a sparse table 2 - sr2->mutable_value()->Resize(phi::make_ddim({table_size, embedding_width})); + sr2->mutable_value()->Resize( + common::make_ddim({table_size, embedding_width})); auto* data_sr2 = sr2->mutable_value()->mutable_data(cpu); for (int64_t i = 0; i < table_size; ++i) { for (int64_t j = 0; j < embedding_width; ++j) { diff --git a/test/cpp/eager/data_structure_tests/tensor_wrapper_test.cc b/test/cpp/eager/data_structure_tests/tensor_wrapper_test.cc index a3a82b0c3b2018..38eb45fe192487 100644 --- a/test/cpp/eager/data_structure_tests/tensor_wrapper_test.cc +++ b/test/cpp/eager/data_structure_tests/tensor_wrapper_test.cc @@ -23,7 +23,7 @@ TEST(TensorWrapper, Basic) { VLOG(6) << "Test Full reserved"; paddle::Tensor et1; phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 2})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) @@ -52,7 +52,7 @@ TEST(TensorWrapper, Basic) { VLOG(6) << "Test reconstruct"; paddle::Tensor et2; phi::DenseTensorMeta meta2 = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 2})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 2})); std::shared_ptr dt2 = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) diff --git a/test/cpp/eager/performance_tests/CMakeLists.txt b/test/cpp/eager/performance_tests/CMakeLists.txt index 7b48812d6dd7fe..69388abb70861d 100644 --- a/test/cpp/eager/performance_tests/CMakeLists.txt +++ b/test/cpp/eager/performance_tests/CMakeLists.txt @@ -16,15 +16,15 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) eager_prim_api) paddle_test(test_egr_performance_benchmark_eager_cpu SRCS - benchmark_eager_cpu.cc DEPS performance_benchmark_utils) + benchmark_eager_cpu.cc DEPS performance_benchmark_utils common) paddle_test(test_egr_performance_benchmark_fluid_cpu SRCS - benchmark_fluid_cpu.cc DEPS performance_benchmark_utils) + benchmark_fluid_cpu.cc DEPS performance_benchmark_utils common) if(WITH_GPU) paddle_test(test_egr_performance_benchmark_eager_cuda SRCS - benchmark_eager_cuda.cc DEPS performance_benchmark_utils) + benchmark_eager_cuda.cc DEPS performance_benchmark_utils common) paddle_test(test_egr_performance_benchmark_fluid_cuda SRCS - benchmark_fluid_cuda.cc DEPS performance_benchmark_utils) + benchmark_fluid_cuda.cc DEPS performance_benchmark_utils common) endif() if(WITH_ONNXRUNTIME AND WIN32) diff --git a/test/cpp/eager/performance_tests/benchmark_eager_cpu.cc b/test/cpp/eager/performance_tests/benchmark_eager_cpu.cc index b6e991e358fde6..f0865efab3156c 100644 --- a/test/cpp/eager/performance_tests/benchmark_eager_cpu.cc +++ b/test/cpp/eager/performance_tests/benchmark_eager_cpu.cc @@ -41,7 +41,7 @@ TEST(Benchmark, EagerScaleCPU) { eager_test::InitEnv(paddle::platform::CPUPlace()); for (const std::string mode : {"Accuracy", "Performance"}) { - paddle::framework::DDim ddim = phi::make_ddim({2, 4, 4, 4}); + paddle::framework::DDim ddim = common::make_ddim({2, 4, 4, 4}); paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, paddle::platform::CPUPlace(), @@ -81,7 +81,7 @@ TEST(Benchmark, EagerMatmulCPU) { eager_test::InitEnv(paddle::platform::CPUPlace()); for (const std::string mode : {"Accuracy", "Performance"}) { - paddle::framework::DDim ddimX = 
phi::make_ddim({2, 2}); + paddle::framework::DDim ddimX = common::make_ddim({2, 2}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CPUPlace(), @@ -91,7 +91,7 @@ TEST(Benchmark, EagerMatmulCPU) { true); RetainGradForTensor(X); - paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::framework::DDim ddimY = common::make_ddim({2, 2}); paddle::Tensor Y = eager_test::CreateTensorWithValue(ddimY, paddle::platform::CPUPlace(), @@ -133,7 +133,7 @@ TEST(Benchmark, EagerIntermediateMatmulCPU) { paddle::imperative::SetCurrentTracer(tracer); for (const std::string mode : {"Accuracy", "Performance"}) { - paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::framework::DDim ddimX = common::make_ddim({2, 2}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CPUPlace(), @@ -143,7 +143,7 @@ TEST(Benchmark, EagerIntermediateMatmulCPU) { true); RetainGradForTensor(X); - paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::framework::DDim ddimY = common::make_ddim({2, 2}); paddle::Tensor Y = eager_test::CreateTensorWithValue(ddimY, paddle::platform::CPUPlace(), @@ -185,7 +185,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { paddle::imperative::SetCurrentTracer(tracer); for (const std::string mode : {"Accuracy", "Performance"}) { - paddle::framework::DDim ddimX = phi::make_ddim({MLP_M, MLP_N}); + paddle::framework::DDim ddimX = common::make_ddim({MLP_M, MLP_N}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CPUPlace(), @@ -198,7 +198,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { std::vector Ws; std::vector Bs; for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { - paddle::framework::DDim ddimW = phi::make_ddim({MLP_N, MLP_K}); + paddle::framework::DDim ddimW = common::make_ddim({MLP_N, MLP_K}); paddle::Tensor W = eager_test::CreateTensorWithValue(ddimW, paddle::platform::CPUPlace(), @@ -208,7 +208,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { true); RetainGradForTensor(W); - paddle::framework::DDim ddimB = phi::make_ddim({MLP_K}); + paddle::framework::DDim ddimB = common::make_ddim({MLP_K}); paddle::Tensor B = eager_test::CreateTensorWithValue(ddimB, paddle::platform::CPUPlace(), diff --git a/test/cpp/eager/performance_tests/benchmark_eager_cuda.cc b/test/cpp/eager/performance_tests/benchmark_eager_cuda.cc index 79e0d382f75289..6b3f395ea8f5b1 100644 --- a/test/cpp/eager/performance_tests/benchmark_eager_cuda.cc +++ b/test/cpp/eager/performance_tests/benchmark_eager_cuda.cc @@ -41,7 +41,7 @@ TEST(Benchmark, EagerScaleCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) { - paddle::framework::DDim ddim = phi::make_ddim({2, 4, 4, 4}); + paddle::framework::DDim ddim = common::make_ddim({2, 4, 4, 4}); paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, paddle::platform::CUDAPlace(), @@ -83,7 +83,7 @@ TEST(Benchmark, EagerMatmulCUDA) { eager_test::InitEnv(place); for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) { - paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::framework::DDim ddimX = common::make_ddim({2, 2}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CUDAPlace(), @@ -93,7 +93,7 @@ TEST(Benchmark, EagerMatmulCUDA) { true); RetainGradForTensor(X); - paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::framework::DDim ddimY = common::make_ddim({2, 2}); paddle::Tensor Y = eager_test::CreateTensorWithValue(ddimY, 
paddle::platform::CUDAPlace(), @@ -139,7 +139,7 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::imperative::SetCurrentTracer(tracer); for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) { - paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::framework::DDim ddimX = common::make_ddim({2, 2}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CUDAPlace(), @@ -149,7 +149,7 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) { true); RetainGradForTensor(X); - paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::framework::DDim ddimY = common::make_ddim({2, 2}); paddle::Tensor Y = eager_test::CreateTensorWithValue(ddimY, paddle::platform::CUDAPlace(), @@ -195,7 +195,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { paddle::imperative::SetCurrentTracer(tracer); for (const std::string mode : {"Accuracy", "WarmUp", "Performance"}) { - paddle::framework::DDim ddimX = phi::make_ddim({MLP_M, MLP_N}); + paddle::framework::DDim ddimX = common::make_ddim({MLP_M, MLP_N}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CUDAPlace(), @@ -208,7 +208,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { std::vector Ws; std::vector Bs; for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { - paddle::framework::DDim ddimW = phi::make_ddim({MLP_N, MLP_K}); + paddle::framework::DDim ddimW = common::make_ddim({MLP_N, MLP_K}); paddle::Tensor W = eager_test::CreateTensorWithValue(ddimW, paddle::platform::CUDAPlace(), @@ -218,7 +218,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { true); RetainGradForTensor(W); - paddle::framework::DDim ddimB = phi::make_ddim({MLP_K}); + paddle::framework::DDim ddimB = common::make_ddim({MLP_K}); paddle::Tensor B = eager_test::CreateTensorWithValue(ddimB, paddle::platform::CUDAPlace(), diff --git a/test/cpp/eager/performance_tests/benchmark_fluid_cpu.cc b/test/cpp/eager/performance_tests/benchmark_fluid_cpu.cc index 069065d03c21a1..f1ac8bc77e7f3a 100644 --- a/test/cpp/eager/performance_tests/benchmark_fluid_cpu.cc +++ b/test/cpp/eager/performance_tests/benchmark_fluid_cpu.cc @@ -51,7 +51,7 @@ TEST(Benchmark, FluidScaleCPU) { std::vector dims = {2, 4, 4, 4}; auto* x_tensor = X->MutableVar()->GetMutable(); - x_tensor->Resize(phi::make_ddim(dims)); + x_tensor->Resize(common::make_ddim(dims)); auto* mutable_x = x_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_x, @@ -100,7 +100,7 @@ TEST(Benchmark, FluidMatmulCPU) { std::vector dims = {2, 2}; auto* x_tensor = X->MutableVar()->GetMutable(); - x_tensor->Resize(phi::make_ddim(dims)); + x_tensor->Resize(common::make_ddim(dims)); auto* mutable_x = x_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_x, @@ -109,7 +109,7 @@ TEST(Benchmark, FluidMatmulCPU) { sizeof(float) * x_src_data.size()); auto* y_tensor = Y->MutableVar()->GetMutable(); - y_tensor->Resize(phi::make_ddim(dims)); + y_tensor->Resize(common::make_ddim(dims)); auto* mutable_y = y_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_y, @@ -161,7 +161,7 @@ TEST(Benchmark, FluidMLPCPU) { X->SetOverridedStopGradient(false); auto* x_tensor = X->MutableVar()->GetMutable(); - x_tensor->Resize(phi::make_ddim(x_dims)); + x_tensor->Resize(common::make_ddim(x_dims)); auto* mutable_x = x_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_x, @@ -180,7 +180,7 @@ TEST(Benchmark, FluidMLPCPU) { B->SetOverridedStopGradient(false); auto* w_tensor = W->MutableVar()->GetMutable(); - w_tensor->Resize(phi::make_ddim(w_dims)); + 
w_tensor->Resize(common::make_ddim(w_dims)); auto* mutable_w = w_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_w, @@ -189,7 +189,7 @@ TEST(Benchmark, FluidMLPCPU) { sizeof(float) * w_src_data.size()); auto* b_tensor = B->MutableVar()->GetMutable(); - b_tensor->Resize(phi::make_ddim(b_dims)); + b_tensor->Resize(common::make_ddim(b_dims)); auto* mutable_b = b_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_b, diff --git a/test/cpp/eager/performance_tests/benchmark_fluid_cuda.cc b/test/cpp/eager/performance_tests/benchmark_fluid_cuda.cc index 178fbdce86c3d6..909165bf99688a 100644 --- a/test/cpp/eager/performance_tests/benchmark_fluid_cuda.cc +++ b/test/cpp/eager/performance_tests/benchmark_fluid_cuda.cc @@ -52,7 +52,7 @@ TEST(Benchmark, FluidScaleCUDA) { std::vector dims = {2, 4, 4, 4}; auto* x_tensor = X->MutableVar()->GetMutable(); - x_tensor->Resize(phi::make_ddim(dims)); + x_tensor->Resize(common::make_ddim(dims)); auto* mutable_x = x_tensor->mutable_data(place); paddle::platform::DeviceContextPool& pool = @@ -115,7 +115,7 @@ TEST(Benchmark, FluidMatmulCUDA) { auto stream = dev_ctx->stream(); auto* x_tensor = X->MutableVar()->GetMutable(); - x_tensor->Resize(phi::make_ddim(dims)); + x_tensor->Resize(common::make_ddim(dims)); auto* mutable_x = x_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_x, @@ -125,7 +125,7 @@ TEST(Benchmark, FluidMatmulCUDA) { stream); auto* y_tensor = Y->MutableVar()->GetMutable(); - y_tensor->Resize(phi::make_ddim(dims)); + y_tensor->Resize(common::make_ddim(dims)); auto* mutable_y = y_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_y, @@ -185,7 +185,7 @@ TEST(Benchmark, FluidMLPCUDA) { X->SetOverridedStopGradient(false); auto* x_tensor = X->MutableVar()->GetMutable(); - x_tensor->Resize(phi::make_ddim(x_dims)); + x_tensor->Resize(common::make_ddim(x_dims)); auto* mutable_x = x_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_x, @@ -205,7 +205,7 @@ TEST(Benchmark, FluidMLPCUDA) { B->SetOverridedStopGradient(false); auto* w_tensor = W->MutableVar()->GetMutable(); - w_tensor->Resize(phi::make_ddim(w_dims)); + w_tensor->Resize(common::make_ddim(w_dims)); auto* mutable_w = w_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_w, @@ -215,7 +215,7 @@ TEST(Benchmark, FluidMLPCUDA) { stream); auto* b_tensor = B->MutableVar()->GetMutable(); - b_tensor->Resize(phi::make_ddim(b_dims)); + b_tensor->Resize(common::make_ddim(b_dims)); auto* mutable_b = b_tensor->mutable_data(place); paddle::memory::Copy(place, mutable_b, diff --git a/test/cpp/eager/task_tests/CMakeLists.txt b/test/cpp/eager/task_tests/CMakeLists.txt index 4df64e81d0ffc4..9bcd4b19f856a2 100755 --- a/test/cpp/eager/task_tests/CMakeLists.txt +++ b/test/cpp/eager/task_tests/CMakeLists.txt @@ -1,7 +1,7 @@ cc_test( test_egr_task_nan_inf_utils SRCS nan_inf_utils_test.cc - DEPS eager_nan_inf_utils phi) + DEPS eager_nan_inf_utils phi common) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test( diff --git a/test/cpp/eager/task_tests/backward_test.cc b/test/cpp/eager/task_tests/backward_test.cc index c520c92c7f3e29..a6730d2dead69d 100644 --- a/test/cpp/eager/task_tests/backward_test.cc +++ b/test/cpp/eager/task_tests/backward_test.cc @@ -38,7 +38,7 @@ TEST(Backward, SingleNodeEmptyGrad) { eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor 
target_tensor = @@ -90,7 +90,7 @@ TEST(Backward, SingleNodeCustomGrad) { // Prepare Inputs std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor tensor = @@ -162,7 +162,7 @@ TEST(Backward, LinearNodes) { // Prepare Inputs std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor tensor = @@ -237,7 +237,7 @@ TEST(Backward, WithAccumulation) { eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor std::vector target_tensors; diff --git a/test/cpp/eager/task_tests/cross_batch_accumulation_test.cc b/test/cpp/eager/task_tests/cross_batch_accumulation_test.cc index b4d62fa27c08fc..007f8f80dacc73 100644 --- a/test/cpp/eager/task_tests/cross_batch_accumulation_test.cc +++ b/test/cpp/eager/task_tests/cross_batch_accumulation_test.cc @@ -35,7 +35,7 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { eager_test::InitEnv(paddle::platform::CPUPlace()); std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, diff --git a/test/cpp/eager/task_tests/eager_utils_test.cc b/test/cpp/eager/task_tests/eager_utils_test.cc index 77902fa5eed506..5326e359780c2a 100644 --- a/test/cpp/eager/task_tests/eager_utils_test.cc +++ b/test/cpp/eager/task_tests/eager_utils_test.cc @@ -31,7 +31,7 @@ namespace egr { TEST(EagerUtils, AutoGradMeta) { // Construct Eager Tensor phi::DenseTensorMeta meta = - phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + phi::DenseTensorMeta(phi::DataType::FLOAT32, common::make_ddim({1, 1})); std::shared_ptr dt0 = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) @@ -169,7 +169,7 @@ TEST(EagerUtils, PassStopGradient) { } TEST(EagerUtils, TrySyncToVar) { - paddle::framework::DDim ddim = phi::make_ddim({2, 4, 4, 4}); + paddle::framework::DDim ddim = common::make_ddim({2, 4, 4, 4}); auto tensor = CreateTestCPUTensor(5.0f, ddim); std::vector> var_bases = { egr::EagerUtils::TrySyncToVar(tensor)}; @@ -187,7 +187,7 @@ TEST(EagerUtils, TrySyncToVar) { } TEST(EagerUtils, TrySyncToVars) { - paddle::framework::DDim ddim = phi::make_ddim({2, 4, 4, 4}); + paddle::framework::DDim ddim = common::make_ddim({2, 4, 4, 4}); std::vector tensors = {CreateTestCPUTensor(1.0f, ddim), CreateTestCPUTensor(2.0f, ddim)}; diff --git a/test/cpp/eager/task_tests/forward_autograd_test.cc b/test/cpp/eager/task_tests/forward_autograd_test.cc index d7d1b87c99dfb6..c68e51ab2b08bd 100644 --- a/test/cpp/eager/task_tests/forward_autograd_test.cc +++ b/test/cpp/eager/task_tests/forward_autograd_test.cc @@ -35,7 +35,7 @@ TEST(Forward, SingleNode) { // Prepare Inputs std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor t = @@ -85,7 +85,7 @@ TEST(Forward, LinearNodes) { // Prepare Inputs std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = 
common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor t = @@ -171,7 +171,7 @@ TEST(Forward, BranchedNodes) { // Prepare Inputs std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor t = diff --git a/test/cpp/eager/task_tests/fwd_bwd_joint_test.cc b/test/cpp/eager/task_tests/fwd_bwd_joint_test.cc index 1aff3a2104fa15..133bd7e7c954ac 100644 --- a/test/cpp/eager/task_tests/fwd_bwd_joint_test.cc +++ b/test/cpp/eager/task_tests/fwd_bwd_joint_test.cc @@ -43,7 +43,8 @@ paddle::Tensor hook_function(const paddle::Tensor& t) { auto ret_meta = phi::DenseTensorMeta( t_dense->dtype(), t_dense->dims(), t_dense->layout()); auto place = t_dense->place(); - size_t bytes_size = phi::product(t_dense->dims()) * SizeOf(t_dense->dtype()); + size_t bytes_size = + common::product(t_dense->dims()) * SizeOf(t_dense->dtype()); auto ret_dense = std::make_shared( paddle::memory::Alloc(place, bytes_size), std::move(ret_meta)); @@ -64,7 +65,7 @@ TEST(FwdBwdJoint, SingleNode) { eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, paddle::platform::CPUPlace(), @@ -108,7 +109,7 @@ TEST(FwdBwdJoint, LinearNodes) { eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, paddle::platform::CPUPlace(), @@ -162,7 +163,7 @@ TEST(FwdBwdJoint, BranchedNodes) { eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, paddle::platform::CPUPlace(), @@ -235,7 +236,7 @@ TEST(FwdBwdJoint, GradientHook) { eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, paddle::platform::CPUPlace(), @@ -308,7 +309,7 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, paddle::platform::CPUPlace(), @@ -363,7 +364,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); // 1. Prepare Input - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, paddle::platform::CUDAPlace(), @@ -404,7 +405,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); // 1. 
Prepare Input - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, paddle::platform::CUDAPlace(), diff --git a/test/cpp/eager/task_tests/generated_test.cc b/test/cpp/eager/task_tests/generated_test.cc index 36032ef21f1b1f..3c1753a3dd5263 100644 --- a/test/cpp/eager/task_tests/generated_test.cc +++ b/test/cpp/eager/task_tests/generated_test.cc @@ -41,7 +41,7 @@ TEST(Generated, Sigmoid) { eager_test::InitEnv(paddle::platform::CPUPlace()); VLOG(6) << "Init Env"; // 1. Prepare Input - paddle::framework::DDim ddim = phi::make_ddim({2, 4, 4, 4}); + paddle::framework::DDim ddim = common::make_ddim({2, 4, 4, 4}); VLOG(6) << "Make Dim"; paddle::Tensor tensor = eager_test::CreateTensorWithValue(ddim, @@ -73,7 +73,7 @@ TEST(Generated, Matmul_v2) { paddle::imperative::SetCurrentTracer(tracer); // 1. Prepare Input - paddle::framework::DDim ddimX = phi::make_ddim({4, 16}); + paddle::framework::DDim ddimX = common::make_ddim({4, 16}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CPUPlace(), @@ -83,7 +83,7 @@ TEST(Generated, Matmul_v2) { true); egr_utils_api::RetainGradForTensor(X); - paddle::framework::DDim ddimY = phi::make_ddim({16, 20}); + paddle::framework::DDim ddimY = common::make_ddim({16, 20}); paddle::Tensor Y = eager_test::CreateTensorWithValue(ddimY, paddle::platform::CPUPlace(), @@ -113,7 +113,7 @@ TEST(Generated, ElementwiseAdd) { paddle::imperative::SetCurrentTracer(tracer); // 1. Prepare Input - paddle::framework::DDim ddimX = phi::make_ddim({4, 16}); + paddle::framework::DDim ddimX = common::make_ddim({4, 16}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CPUPlace(), @@ -123,7 +123,7 @@ TEST(Generated, ElementwiseAdd) { true); egr_utils_api::RetainGradForTensor(X); - paddle::framework::DDim ddimY = phi::make_ddim({4, 16}); + paddle::framework::DDim ddimY = common::make_ddim({4, 16}); paddle::Tensor Y = eager_test::CreateTensorWithValue(ddimY, paddle::platform::CPUPlace(), diff --git a/test/cpp/eager/task_tests/grad_test.cc b/test/cpp/eager/task_tests/grad_test.cc index ed4fb839dc6cd0..878ce0404954d4 100644 --- a/test/cpp/eager/task_tests/grad_test.cc +++ b/test/cpp/eager/task_tests/grad_test.cc @@ -37,7 +37,7 @@ TEST(Grad, SingleNodeEmptyGrad) { eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor (output) paddle::Tensor output_tensor = @@ -104,7 +104,7 @@ TEST(Grad, SingleNodeCustomGrad) { // Prepare Inputs std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor tensor = @@ -183,7 +183,7 @@ TEST(Grad, LinearNodes) { // Prepare Target Tensor std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor tensor = @@ -264,7 +264,7 @@ TEST(Grad, WithAccumulation) { eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor std::vector target_tensors; diff --git 
a/test/cpp/eager/task_tests/hook_test.cc b/test/cpp/eager/task_tests/hook_test.cc index 898590201eef63..b0812ea48d562d 100644 --- a/test/cpp/eager/task_tests/hook_test.cc +++ b/test/cpp/eager/task_tests/hook_test.cc @@ -38,7 +38,8 @@ paddle::Tensor hook_function(const paddle::Tensor& t) { auto ret_meta = phi::DenseTensorMeta( t_dense->dtype(), t_dense->dims(), t_dense->layout()); auto place = t_dense->place(); - size_t bytes_size = phi::product(t_dense->dims()) * SizeOf(t_dense->dtype()); + size_t bytes_size = + common::product(t_dense->dims()) * SizeOf(t_dense->dtype()); auto ret_dense = std::make_shared( paddle::memory::Alloc(place, bytes_size), std::move(ret_meta)); @@ -60,7 +61,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) { // Prepare Inputs std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor tensor = @@ -136,7 +137,7 @@ TEST(RetainGrad, HookAfterRetainGrad) { // Prepare Inputs std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor tensor = diff --git a/test/cpp/eager/task_tests/hook_test_intermidiate.cc b/test/cpp/eager/task_tests/hook_test_intermidiate.cc index 37070d9b7b8f27..050672e2f07c52 100644 --- a/test/cpp/eager/task_tests/hook_test_intermidiate.cc +++ b/test/cpp/eager/task_tests/hook_test_intermidiate.cc @@ -41,7 +41,8 @@ paddle::Tensor hook_function(const paddle::Tensor& t) { auto ret_meta = phi::DenseTensorMeta( t_dense->dtype(), t_dense->dims(), t_dense->layout()); auto place = t_dense->place(); - size_t bytes_size = phi::product(t_dense->dims()) * SizeOf(t_dense->dtype()); + size_t bytes_size = + common::product(t_dense->dims()) * SizeOf(t_dense->dtype()); auto ret_dense = std::make_shared( paddle::memory::Alloc(place, bytes_size), std::move(ret_meta)); @@ -64,7 +65,7 @@ void test_sigmoid(bool is_remove_gradient_hook) { eager_test::InitEnv(paddle::platform::CPUPlace()); VLOG(6) << "Make Dim"; - paddle::framework::DDim ddim = phi::make_ddim({2, 4, 4, 4}); + paddle::framework::DDim ddim = common::make_ddim({2, 4, 4, 4}); VLOG(6) << "Make paddle::Tensor"; paddle::Tensor tensor = @@ -131,7 +132,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) { paddle::imperative::SetCurrentTracer(tracer); // 1. Prepare Input - paddle::framework::DDim ddimX = phi::make_ddim({4, 16}); + paddle::framework::DDim ddimX = common::make_ddim({4, 16}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CPUPlace(), @@ -141,7 +142,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) { true); egr_utils_api::RetainGradForTensor(X); - paddle::framework::DDim ddimY = phi::make_ddim({4, 16}); + paddle::framework::DDim ddimY = common::make_ddim({4, 16}); paddle::Tensor Y = eager_test::CreateTensorWithValue(ddimY, paddle::platform::CPUPlace(), @@ -195,7 +196,7 @@ void test_matmul(bool is_remove_gradient_hook) { paddle::imperative::SetCurrentTracer(tracer); // 1. 
Prepare Input - paddle::framework::DDim ddimX = phi::make_ddim({4, 16}); + paddle::framework::DDim ddimX = common::make_ddim({4, 16}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CPUPlace(), @@ -205,7 +206,7 @@ void test_matmul(bool is_remove_gradient_hook) { true); egr_utils_api::RetainGradForTensor(X); - paddle::framework::DDim ddimY = phi::make_ddim({16, 20}); + paddle::framework::DDim ddimY = common::make_ddim({16, 20}); paddle::Tensor Y = eager_test::CreateTensorWithValue(ddimY, paddle::platform::CPUPlace(), @@ -258,7 +259,7 @@ void test_backward_final_hooks() { eager_test::InitEnv(paddle::platform::CPUPlace()); VLOG(6) << "Make paddle::Tensor"; - paddle::framework::DDim ddimX = phi::make_ddim({4, 16}); + paddle::framework::DDim ddimX = common::make_ddim({4, 16}); paddle::Tensor X = eager_test::CreateTensorWithValue(ddimX, paddle::platform::CPUPlace(), @@ -266,7 +267,7 @@ void test_backward_final_hooks() { phi::DataLayout::NCHW, 3.0, true); - paddle::framework::DDim ddimY = phi::make_ddim({16, 20}); + paddle::framework::DDim ddimY = common::make_ddim({16, 20}); egr_utils_api::RetainGradForTensor(X); paddle::Tensor Y = diff --git a/test/cpp/eager/task_tests/tensor_utils_test.cc b/test/cpp/eager/task_tests/tensor_utils_test.cc index a39280101a5bc7..98d8c59ae2e47e 100644 --- a/test/cpp/eager/task_tests/tensor_utils_test.cc +++ b/test/cpp/eager/task_tests/tensor_utils_test.cc @@ -32,7 +32,7 @@ TEST(TensorUtils, Test) { // Prepare Inputs std::vector target_tensors; - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); // Create Target Tensor paddle::Tensor t = diff --git a/test/cpp/eager/test_utils.h b/test/cpp/eager/test_utils.h index 9d6ec2fbf797bb..393eec6f402bfb 100644 --- a/test/cpp/eager/test_utils.h +++ b/test/cpp/eager/test_utils.h @@ -35,8 +35,11 @@ inline paddle::Tensor CreateTensorWithValue( const phi::DataLayout& layout, float value, bool is_leaf = true) { - paddle::Tensor out = paddle::experimental::full( - phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, place); + paddle::Tensor out = + paddle::experimental::full(common::vectorize(ddim), + paddle::experimental::Scalar(value), + dtype, + place); auto meta = egr::EagerUtils::autograd_meta(&out); if (is_leaf) { diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt index bba22ebf76b935..cf5a0d21302e1b 100644 --- a/test/cpp/fluid/CMakeLists.txt +++ b/test/cpp/fluid/CMakeLists.txt @@ -47,6 +47,7 @@ cc_test( generated_op elementwise_add_op phi + common generated_static_op) cc_test( gather_test @@ -59,7 +60,7 @@ cc_test( cc_test( scatter_test SRCS scatter_test.cc - DEPS tensor phi) + DEPS tensor phi common) cc_test( beam_search_decode_op_test SRCS beam_search_decode_op_test.cc @@ -79,7 +80,7 @@ if(WITH_GPU) nv_test( dropout_op_test SRCS dropout_op_test.cc - DEPS dropout_op tensor phi) + DEPS dropout_op tensor phi common) nv_test( test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc @@ -88,12 +89,18 @@ if(WITH_GPU) nv_test( feed_forward_test SRCS feed_forward_test.cu - DEPS fleet_executor elementwise_add_op matmul_op tensor phi ${CINN_DEPS}) + DEPS fleet_executor + elementwise_add_op + matmul_op + tensor + phi + common + ${CINN_DEPS}) elseif(WITH_ROCM) hip_test( dropout_op_test SRCS dropout_op_test.cc - DEPS dropout_op tensor phi) + DEPS dropout_op tensor phi common) hip_test( test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc @@ -117,6 
+124,7 @@ if(WITH_CINN) op_debug_string_test.cc DEPS executor + common fleet_executor recurrent_op_helper recurrent_op @@ -124,14 +132,20 @@ if(WITH_CINN) ${COMMON_OP_DEPS} python) else() - paddle_test(op_debug_string_test SRCS op_debug_string_test.cc) + paddle_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS common) endif() if(WITH_GPU) cc_test( copy_cross_scope_test SRCS copy_cross_scope_test.cc - DEPS op_registry copy_cross_scope_op scope device_context enforce executor) + DEPS op_registry + copy_cross_scope_op + scope + device_context + enforce + executor + common) endif() if(WITH_ONNXRUNTIME AND WIN32) diff --git a/test/cpp/fluid/assign_op_test.cc b/test/cpp/fluid/assign_op_test.cc index cc6c915c09a40c..8f53cce426456b 100644 --- a/test/cpp/fluid/assign_op_test.cc +++ b/test/cpp/fluid/assign_op_test.cc @@ -15,10 +15,10 @@ limitations under the License. */ #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" TEST(AssignOp, AssignLoDTensor) { paddle::platform::CPUPlace cpu_place; @@ -28,7 +28,7 @@ TEST(AssignOp, AssignLoDTensor) { paddle::operators::AssignFunctor assign_functor(&output, ctx); phi::DenseTensor input; - paddle::framework::DDim in_dims = phi::make_ddim({3, 4}); + paddle::framework::DDim in_dims = common::make_ddim({3, 4}); int* in_data = input.mutable_data(in_dims, cpu_place); for (int i = 0; i < 12; ++i) { in_data[i] = i; @@ -54,7 +54,7 @@ TEST(AssignOp, AssignLoDTensorArray) { paddle::framework::LoDTensorArray input; for (int i = 0; i < 5; ++i) { - paddle::framework::DDim in_dims = phi::make_ddim({i + 1, i + 2}); + paddle::framework::DDim in_dims = common::make_ddim({i + 1, i + 2}); phi::DenseTensor lod_tensor; float* in_data = lod_tensor.mutable_data(in_dims, cpu_place); for (int j = 0; j < (i + 1) * (i + 2); ++j) { @@ -68,7 +68,7 @@ TEST(AssignOp, AssignLoDTensorArray) { auto& out_array = output.Get(); for (int i = 0; i < 5; ++i) { paddle::framework::DDim out_dims = out_array[i].dims(); - EXPECT_EQ(phi::make_ddim({i + 1, i + 2}), out_dims); + EXPECT_EQ(common::make_ddim({i + 1, i + 2}), out_dims); const float* out_data = out_array[i].data(); for (int j = 0; j < (i + 1) * (i + 2); ++j) { EXPECT_EQ(static_cast(j), out_data[j]); @@ -89,7 +89,7 @@ TEST(AssignOp, AssignSelectedRows) { phi::SelectedRows input(rows, height); phi::DenseTensor* input_tensor = input.mutable_value(); - paddle::framework::DDim in_dims = phi::make_ddim({3, 4}); + paddle::framework::DDim in_dims = common::make_ddim({3, 4}); int* in_data = input_tensor->mutable_data(in_dims, cpu_place); for (int i = 0; i < 12; ++i) { in_data[i] = i; diff --git a/test/cpp/fluid/benchmark/CMakeLists.txt b/test/cpp/fluid/benchmark/CMakeLists.txt index 9111dfe2ff35f8..bc634f55cab792 100644 --- a/test/cpp/fluid/benchmark/CMakeLists.txt +++ b/test/cpp/fluid/benchmark/CMakeLists.txt @@ -11,7 +11,8 @@ cc_test( scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} - phi) + phi + common) if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will diff --git a/test/cpp/fluid/benchmark/op_tester.cc b/test/cpp/fluid/benchmark/op_tester.cc index 6f68ab23a45669..a06e8c02c8d238 100644 --- a/test/cpp/fluid/benchmark/op_tester.cc +++ b/test/cpp/fluid/benchmark/op_tester.cc @@ -280,14 +280,14 @@ void OpTester::SetupTensor(phi::DenseTensor *tensor, std::mt19937 rng(seed++); std::uniform_real_distribution uniform_dist(0, 1); - T *ptr = 
tensor->mutable_data(phi::make_ddim(shape), place_); + T *ptr = tensor->mutable_data(common::make_ddim(shape), place_); phi::DenseTensor cpu_tensor; T *cpu_ptr = nullptr; if (!platform::is_cpu_place(place_)) { - cpu_ptr = - cpu_tensor.mutable_data(phi::make_ddim(shape), platform::CPUPlace()); + cpu_ptr = cpu_tensor.mutable_data(common::make_ddim(shape), + platform::CPUPlace()); } else { cpu_ptr = ptr; } diff --git a/test/cpp/fluid/benchmark/op_tester.h b/test/cpp/fluid/benchmark/op_tester.h index de8f62cfe07cd6..5dc2461fbd96a4 100644 --- a/test/cpp/fluid/benchmark/op_tester.h +++ b/test/cpp/fluid/benchmark/op_tester.h @@ -19,9 +19,9 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/phi/core/ddim.h" #include "test/cpp/fluid/benchmark/op_tester_config.h" namespace paddle { diff --git a/test/cpp/fluid/cinn/CMakeLists.txt b/test/cpp/fluid/cinn/CMakeLists.txt index da4a085cef41d5..0fae3ea78737c5 100644 --- a/test/cpp/fluid/cinn/CMakeLists.txt +++ b/test/cpp/fluid/cinn/CMakeLists.txt @@ -6,6 +6,7 @@ if(WITH_TESTING) DEPS fleet_executor phi + common lod_tensor scope proto_desc diff --git a/test/cpp/fluid/cinn/cinn_launch_context_test.cc b/test/cpp/fluid/cinn/cinn_launch_context_test.cc index 5e7fbea5d876ff..032aad828365c9 100644 --- a/test/cpp/fluid/cinn/cinn_launch_context_test.cc +++ b/test/cpp/fluid/cinn/cinn_launch_context_test.cc @@ -27,6 +27,7 @@ limitations under the License. */ #include "paddle/cinn/hlir/framework/scope.h" #include "paddle/cinn/hlir/framework/tensor.h" #include "paddle/cinn/runtime/cinn_runtime.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" @@ -35,7 +36,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" -#include "paddle/phi/core/ddim.h" #include "paddle/pir/core/program.h" #include "paddle/pir/core/value.h" @@ -200,7 +200,7 @@ TEST_F(CinnLaunchContextTest, TestConstructResult) { auto* buffer = launch_context->GetCinnBufferOfVar(var_name); auto&& scope = compiled_obj->scope; ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), - phi::make_ddim(scope->GetTensor(arg_name)->shape().data())); + common::make_ddim(scope->GetTensor(arg_name)->shape().data())); }; check_argument_fn("var1", "cinn_var1"); check_argument_fn("var2", "cinn_var2"); @@ -216,11 +216,11 @@ TEST_F(CinnLaunchContextTest, TestCheckTensorEquivalent) { auto* tensor2 = scope.Var("var2")->GetMutable(); // dimension not equivalent - tensor1->mutable_data(phi::make_ddim({3, 5}), place); + tensor1->mutable_data(common::make_ddim({3, 5}), place); ASSERT_THROW(launch_context->CheckTensorEquivalent("var1", *tensor1), paddle::platform::EnforceNotMet); // data type not equivalent - tensor2->mutable_data(phi::make_ddim({6, 7, 8}), place); + tensor2->mutable_data(common::make_ddim({6, 7, 8}), place); ASSERT_THROW(launch_context->CheckTensorEquivalent("var2", *tensor2), paddle::platform::EnforceNotMet); } @@ -243,7 +243,7 @@ TEST_F(CinnLaunchContextTest, TestBuildCompiledProgram) { ASSERT_NE(var, nullptr); auto* buffer = launch_context->GetCinnBufferOfVar(var_name); ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), - phi::make_ddim(var->GetShape())); + common::make_ddim(var->GetShape())); } ASSERT_TRUE(block.FindVar("var1")->Persistable()); ASSERT_FALSE(block.FindVar("var5")->Persistable()); @@ -274,7 +274,7 @@ TEST_F(CinnLaunchContextTest, TestCallbackAssignment) { // assign external variables auto* tensor1 = scope.Var("var1")->GetMutable(); - float* data1 = tensor1->mutable_data(phi::make_ddim({3, 4}), place); + float* data1 = tensor1->mutable_data(common::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; // check argument is set correctly and alloc/free callbacks work well diff --git a/test/cpp/fluid/cinn/cinn_launch_op_test.cc b/test/cpp/fluid/cinn/cinn_launch_op_test.cc index 5765a2c50269ee..487a0e7d7820a4 100644 --- a/test/cpp/fluid/cinn/cinn_launch_op_test.cc +++ b/test/cpp/fluid/cinn/cinn_launch_op_test.cc @@ -21,12 +21,12 @@ limitations under the License. */ #include #include "gtest/gtest.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/init.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" #include "test/cpp/fluid/cinn/test_helper.h" diff --git a/test/cpp/fluid/cinn/test_helper.h b/test/cpp/fluid/cinn/test_helper.h index 040a1858101365..5ffb1120bb7886 100644 --- a/test/cpp/fluid/cinn/test_helper.h +++ b/test/cpp/fluid/cinn/test_helper.h @@ -20,12 +20,12 @@ limitations under the License. 
*/ #include #include "gtest/gtest.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/phi/core/ddim.h" namespace paddle::operators { diff --git a/test/cpp/fluid/controlflow/conditional_block_op_test.cc b/test/cpp/fluid/controlflow/conditional_block_op_test.cc index 62552dc1fc8ad7..a4575f258d72e3 100644 --- a/test/cpp/fluid/controlflow/conditional_block_op_test.cc +++ b/test/cpp/fluid/controlflow/conditional_block_op_test.cc @@ -32,14 +32,14 @@ TEST(ConditionalBlockGrad, NoNeedRunLoDTensorArray) { Variable* cond_var = scope.Var("condition"); phi::DenseTensor* cond_tensor = cond_var->GetMutable(); - paddle::framework::DDim cond_dims = phi::make_ddim({1}); + paddle::framework::DDim cond_dims = common::make_ddim({1}); bool* cond_data = cond_tensor->mutable_data(cond_dims, place); cond_data[0] = false; Variable* input_var = scope.Var("input_lod_tensor_array"); LoDTensorArray* input_tensors = input_var->GetMutable(); for (int i = 0; i < 5; ++i) { - paddle::framework::DDim in_dims = phi::make_ddim({i + 1, i + 2}); + paddle::framework::DDim in_dims = common::make_ddim({i + 1, i + 2}); phi::DenseTensor lod_tensor; float* in_data = lod_tensor.mutable_data(in_dims, place); for (int j = 0; j < (i + 1) * (i + 2); ++j) { @@ -66,7 +66,7 @@ TEST(ConditionalBlockGrad, NoNeedRunLoDTensorArray) { const LoDTensorArray& out_tensors = input_grad_var->Get(); for (int i = 0; i < 5; ++i) { paddle::framework::DDim out_dims = out_tensors[i].dims(); - EXPECT_EQ(phi::make_ddim({i + 1, i + 2}), out_dims); + EXPECT_EQ(common::make_ddim({i + 1, i + 2}), out_dims); const float* out_data = out_tensors[i].data(); for (int j = 0; j < (i + 1) * (i + 2); ++j) { EXPECT_EQ(0, out_data[j]); diff --git a/test/cpp/fluid/dlnne/dlnne_engine_op_test.cc b/test/cpp/fluid/dlnne/dlnne_engine_op_test.cc index 01e12bf1132aa0..af373874aa0b2a 100644 --- a/test/cpp/fluid/dlnne/dlnne_engine_op_test.cc +++ b/test/cpp/fluid/dlnne/dlnne_engine_op_test.cc @@ -37,7 +37,7 @@ void CreateCUDATensor(framework::Scope* scope, const std::vector& shape) { auto* var = scope->Var(name); auto* tensor = var->GetMutable(); - auto dims = phi::make_ddim(shape); + auto dims = common::make_ddim(shape); tensor->Resize(dims); platform::CUDAPlace place; phi::GPUContext ctx(place); diff --git a/test/cpp/fluid/elementwise/CMakeLists.txt b/test/cpp/fluid/elementwise/CMakeLists.txt index 304063cd81c4c2..f8a9f8b061cb30 100644 --- a/test/cpp/fluid/elementwise/CMakeLists.txt +++ b/test/cpp/fluid/elementwise/CMakeLists.txt @@ -1,12 +1,30 @@ cc_test( test_elementwise_add_op_inplace SRCS test_elementwise_add_op_inplace.cc - DEPS executor op_registry elementwise_add_op scope device_context enforce) + DEPS executor + op_registry + elementwise_add_op + scope + device_context + enforce + common) cc_test( test_elementwise_div_grad_grad SRCS test_elementwise_div_grad_grad.cc - DEPS executor op_registry elementwise_div_op scope device_context enforce) + DEPS executor + op_registry + elementwise_div_op + scope + device_context + enforce + common) cc_test( test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc - DEPS executor op_registry elementwise_add_op scope device_context enforce) + DEPS executor + op_registry + elementwise_add_op + scope + device_context + enforce + common) diff --git 
a/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc b/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc index 4fefd8864c5e9f..630bf64a7d6900 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "gtest/gtest.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h" @@ -44,7 +44,7 @@ class TestElementwiseAddGradGradWithoutDDX using TestElementwiseOpGradGrad::expected_outs_; using TestElementwiseOpGradGrad::dims_; void ComputeExpectedOuts() override { - size_t numel = static_cast(phi::product(dims_)); + size_t numel = static_cast(common::product(dims_)); std::vector dy(numel); std::vector ddout(numel); for (size_t i = 0; i < numel; ++i) { diff --git a/test/cpp/fluid/elementwise/test_elementwise_add_op_inplace.cc b/test/cpp/fluid/elementwise/test_elementwise_add_op_inplace.cc index db026084a56c75..0f665448171017 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_add_op_inplace.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_add_op_inplace.cc @@ -62,7 +62,7 @@ bool TestMain(const platform::Place &place, y->Resize(dims); z->Resize(dims); - size_t numel = static_cast(phi::product(dims)); + size_t numel = static_cast(common::product(dims)); auto x_ptr = x->mutable_data(place); auto y_ptr = y->mutable_data(place); diff --git a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc index e74da1c884f67b..ddf1229cd0367b 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc @@ -56,7 +56,7 @@ class TestElementwiseDivGradGradWithoutDout using TestElementwiseOpGradGrad::expected_outs_; using TestElementwiseOpGradGrad::dims_; void ComputeExpectedOuts() override { - size_t numel = static_cast(phi::product(dims_)); + size_t numel = static_cast(common::product(dims_)); std::vector dy(numel); std::vector ddout(numel); for (size_t i = 0; i < numel; ++i) { diff --git a/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h b/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h index 4edbf5ddd05c83..ab67c559532d96 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h +++ b/test/cpp/fluid/elementwise/test_elementwise_op_grad_grad.h @@ -69,7 +69,7 @@ class TestElementwiseOpGradGrad { } void Setup() { - size_t numel = static_cast(phi::product(dims_)); + size_t numel = static_cast(common::product(dims_)); // init vars in scope and feed inputs for (auto in_name : inputs_) { InitVarInScope(in_name); @@ -127,7 +127,7 @@ class TestElementwiseOpGradGrad { cpu_out = out_tensor; } auto *out_ptr = cpu_out.data(); - size_t numel = static_cast(phi::product(dims_)); + size_t numel = static_cast(common::product(dims_)); #ifdef PADDLE_WITH_HIP auto is_equal = std::equal( out_ptr, diff --git a/test/cpp/fluid/feed_forward_test.cu b/test/cpp/fluid/feed_forward_test.cu index 7febf20e771187..b82f22cd03b5f3 100644 --- a/test/cpp/fluid/feed_forward_test.cu +++ b/test/cpp/fluid/feed_forward_test.cu @@ -62,8 +62,8 @@ void GetLinearOp(const std::vector &x, auto x_ptr = tensor_x->mutable_data(ctx.GetPlace()); auto y_ptr = tensor_y->mutable_data(ctx.GetPlace()); auto 
z_ptr = tensor_out->mutable_data(ctx.GetPlace()); - auto size_x = static_cast(phi::product(x_dim)); - auto size_y = static_cast(phi::product(y_dim)); + auto size_x = static_cast(common::product(x_dim)); + auto size_y = static_cast(common::product(y_dim)); auto size_z = x_dim[0] * x_dim[1] * y_dim[0]; cudaMemcpy(x_ptr, x.data(), size_x * sizeof(T), cudaMemcpyHostToDevice); cudaMemcpy(y_ptr, y.data(), size_y * sizeof(T), cudaMemcpyHostToDevice); @@ -158,8 +158,8 @@ void GetLinearOpGrad(const std::vector &x_vec, auto dinput_ptr = tensor_dx->mutable_data(ctx.GetPlace()); auto dweight_ptr = tensor_dy->mutable_data(ctx.GetPlace()); - auto size_x = static_cast(phi::product(x_dim)); - auto size_y = static_cast(phi::product(y_dim)); + auto size_x = static_cast(common::product(x_dim)); + auto size_y = static_cast(common::product(y_dim)); auto size_z = x_dim[0] * x_dim[1] * y_dim[0]; cudaMemcpy(x_ptr, x_vec.data(), size_x * sizeof(T), cudaMemcpyHostToDevice); cudaMemcpy(y_ptr, y_vec.data(), size_y * sizeof(T), cudaMemcpyHostToDevice); diff --git a/test/cpp/fluid/framework/CMakeLists.txt b/test/cpp/fluid/framework/CMakeLists.txt index 5085fa1dbab413..9b65af64656c17 100644 --- a/test/cpp/fluid/framework/CMakeLists.txt +++ b/test/cpp/fluid/framework/CMakeLists.txt @@ -39,7 +39,7 @@ cc_test( cc_test( lod_tensor_test SRCS lod_tensor_test.cc - DEPS phi lod_tensor memory) + DEPS phi common lod_tensor memory) if(WITH_GPU) nv_test( @@ -61,7 +61,7 @@ cc_test( cc_test( threadpool_test SRCS threadpool_test.cc - DEPS phi) + DEPS phi common) cc_test( var_type_traits_test @@ -87,12 +87,12 @@ if(WITH_GPU) nv_test( data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context phi scope) + DEPS operator op_registry device_context phi common scope) elseif(WITH_ROCM) hip_test( data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context phi scope) + DEPS operator op_registry device_context phi common scope) endif() if(WITH_GPU) @@ -206,6 +206,7 @@ if(WITH_PSCORE) heter_server gloo_wrapper phi + common ${RPC_DEPS} graph_gpu_wrapper) else() @@ -223,6 +224,7 @@ if(WITH_PSCORE) heter_server gloo_wrapper phi + common ${RPC_DEPS}) endif() else() @@ -274,7 +276,7 @@ cc_test_old( cc_test( infershape_utils_test SRCS infershape_utils_test.cc - DEPS infershape_utils phi) + DEPS infershape_utils phi common) if(WITH_TESTING AND TEST selected_rows_utils_test) set_tests_properties(selected_rows_utils_test PROPERTIES TIMEOUT 120) @@ -320,7 +322,8 @@ if(WITH_CINN) python) set_tests_properties(cinn_cache_key_test PROPERTIES LABELS "RUN_TYPE=CINN") - paddle_test(build_cinn_pass_test SRCS paddle2cinn/build_cinn_pass_test.cc) + paddle_test(build_cinn_pass_test SRCS paddle2cinn/build_cinn_pass_test.cc + DEPS common) set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN") # target_link_libraries(build_cinn_pass_test ${PYTHON_LIBRARIES}) diff --git a/test/cpp/fluid/framework/copy_same_tensor_test.cc b/test/cpp/fluid/framework/copy_same_tensor_test.cc index 9b892c0c1b092e..edb293168256f2 100644 --- a/test/cpp/fluid/framework/copy_same_tensor_test.cc +++ b/test/cpp/fluid/framework/copy_same_tensor_test.cc @@ -17,11 +17,11 @@ #include #include "gtest/gtest.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/utils/flags.h" 
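The include hunks above all make the same move: paddle/phi/core/ddim.h is replaced by paddle/common/ddim.h, and call sites switch from the phi:: namespace to common::. A minimal sketch of the post-move API, assuming (as these hunks imply) that DDim, make_ddim, and product are all reachable through namespace common once this PR lands:

#include <cstdint>

#include "paddle/common/ddim.h"

int main() {
  // Build a shape and query its element count through the common
  // namespace, exactly as the updated tests now do.
  common::DDim dims = common::make_ddim({3, 4, 5});
  int64_t numel = common::product(dims);  // 3 * 4 * 5 = 60
  return numel == 60 ? 0 : 1;
}
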
PD_DECLARE_bool(use_system_allocator); @@ -80,13 +80,13 @@ static bool CopySameTensorTestMain(const DDim &dims, const void *ground_truth_ptr = src_cpu_tensor.data(); const void *result_ptr = dst_cpu_tensor.data(); - size_t byte_num = phi::product(dims) * sizeof(T); + size_t byte_num = common::product(dims) * sizeof(T); return std::memcmp(ground_truth_ptr, result_ptr, byte_num) == 0; } TEST(test_tensor_copy, test_copy_same_tensor) { using DataType = float; - auto dims = phi::make_ddim({3, 4, 5}); + auto dims = common::make_ddim({3, 4, 5}); auto places = CreatePlaceList(); for (auto &src_p : places) { diff --git a/test/cpp/fluid/framework/data_layout_transform_test.cc b/test/cpp/fluid/framework/data_layout_transform_test.cc index 8927ac7b949d7f..85094c8b6b5889 100644 --- a/test/cpp/fluid/framework/data_layout_transform_test.cc +++ b/test/cpp/fluid/framework/data_layout_transform_test.cc @@ -21,7 +21,7 @@ TEST(DataTransform, DataLayoutFunction) { auto place = paddle::platform::CPUPlace(); phi::DenseTensor in = phi::DenseTensor(); phi::DenseTensor out = phi::DenseTensor(); - in.mutable_data(phi::make_ddim({2, 3, 1, 2}), place); + in.mutable_data(common::make_ddim({2, 3, 1, 2}), place); in.set_layout(phi::DataLayout::kNHWC); auto kernel_nhwc = @@ -32,19 +32,19 @@ TEST(DataTransform, DataLayoutFunction) { paddle::framework::TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out, place); EXPECT_TRUE(out.layout() == phi::DataLayout::kNCHW); - EXPECT_TRUE(out.dims() == phi::make_ddim({2, 2, 3, 1})); + EXPECT_TRUE(out.dims() == common::make_ddim({2, 2, 3, 1})); paddle::framework::TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out, place); EXPECT_TRUE(in.layout() == phi::DataLayout::kNHWC); - EXPECT_TRUE(in.dims() == phi::make_ddim({2, 3, 1, 2})); + EXPECT_TRUE(in.dims() == common::make_ddim({2, 3, 1, 2})); } #ifdef PADDLE_WITH_DNNL TEST(DataTransformBf16, GetDataFromTensorDNNL) { auto place = paddle::platform::CPUPlace(); phi::DenseTensor in = phi::DenseTensor(); - in.mutable_data(phi::make_ddim({2, 3, 1, 2}), + in.mutable_data(common::make_ddim({2, 3, 1, 2}), place); void* in_data = @@ -56,7 +56,7 @@ TEST(DataTransformBf16, GetDataFromTensorDNNL) { TEST(DataTransformInt32, GetDataFromTensorDNNL) { auto place = paddle::platform::CPUPlace(); phi::DenseTensor in = phi::DenseTensor(); - in.mutable_data(phi::make_ddim({2, 3, 1, 2}), place); + in.mutable_data(common::make_ddim({2, 3, 1, 2}), place); void* in_data = phi::funcs::GetDataFromTensor(in, dnnl::memory::data_type::s32); diff --git a/test/cpp/fluid/framework/data_type_transform_test.cc b/test/cpp/fluid/framework/data_type_transform_test.cc index b0ed3328348480..528aaa88d63b5a 100644 --- a/test/cpp/fluid/framework/data_type_transform_test.cc +++ b/test/cpp/fluid/framework/data_type_transform_test.cc @@ -45,7 +45,7 @@ TEST(DataTypeTransform, CPUTransform) { phi::DenseTensor in; phi::DenseTensor out; - float* ptr = in.mutable_data(phi::make_ddim({2, 3}), place); + float* ptr = in.mutable_data(common::make_ddim({2, 3}), place); int data_number = 2 * 3; for (int i = 0; i < data_number; ++i) { @@ -71,7 +71,7 @@ TEST(DataTypeTransform, CPUTransform) { phi::DenseTensor out; paddle::platform::float16* ptr = in.mutable_data( - phi::make_ddim({2, 3}), place); + common::make_ddim({2, 3}), place); int data_number = 2 * 3; for (int i = 0; i < data_number; ++i) { @@ -111,7 +111,7 @@ TEST(DataTypeTransform, CPUTransform) { // transform float to float16 float* in_data_float = - in.mutable_data(phi::make_ddim({2, 3}), place); + in.mutable_data(common::make_ddim({2, 
3}), place); for (int i = 0; i < data_number; ++i) { in_data_float[i] = static_cast(i); } @@ -125,7 +125,7 @@ TEST(DataTypeTransform, CPUTransform) { // transform double to float16 double* in_data_double = - in.mutable_data(phi::make_ddim({2, 3}), place); + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_double[i] = i; } @@ -138,7 +138,7 @@ TEST(DataTypeTransform, CPUTransform) { } // transform int to float16 - int* in_data_int = in.mutable_data(phi::make_ddim({2, 3}), place); + int* in_data_int = in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_int[i] = i; } @@ -152,7 +152,7 @@ TEST(DataTypeTransform, CPUTransform) { // transform int64 to float16 int64_t* in_data_int64 = - in.mutable_data(phi::make_ddim({2, 3}), place); + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_int64[i] = i; } @@ -165,7 +165,8 @@ TEST(DataTypeTransform, CPUTransform) { } // transform bool to float16 - bool* in_data_bool = in.mutable_data(phi::make_ddim({2, 3}), place); + bool* in_data_bool = + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_bool[i] = i; } @@ -184,7 +185,7 @@ TEST(DataTypeTransform, CPUTransform) { phi::DenseTensor out; paddle::platform::bfloat16* ptr = - in.mutable_data(phi::make_ddim({2, 3}), + in.mutable_data(common::make_ddim({2, 3}), place); int data_number = 2 * 3; @@ -225,7 +226,7 @@ TEST(DataTypeTransform, CPUTransform) { // transform float to bfloat16 float* in_data_float = - in.mutable_data(phi::make_ddim({2, 3}), place); + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_float[i] = static_cast(i); } @@ -239,7 +240,7 @@ TEST(DataTypeTransform, CPUTransform) { // transform double to bfloat16 double* in_data_double = - in.mutable_data(phi::make_ddim({2, 3}), place); + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_double[i] = i; } @@ -252,7 +253,7 @@ TEST(DataTypeTransform, CPUTransform) { } // transform int to bfloat16 - int* in_data_int = in.mutable_data(phi::make_ddim({2, 3}), place); + int* in_data_int = in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_int[i] = i; } @@ -266,7 +267,7 @@ TEST(DataTypeTransform, CPUTransform) { // transform int64 to bfloat16 int64_t* in_data_int64 = - in.mutable_data(phi::make_ddim({2, 3}), place); + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_int64[i] = i; } @@ -279,7 +280,8 @@ TEST(DataTypeTransform, CPUTransform) { } // transform bool to bfloat16 - bool* in_data_bool = in.mutable_data(phi::make_ddim({2, 3}), place); + bool* in_data_bool = + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_bool[i] = i; } @@ -297,7 +299,7 @@ TEST(DataTypeTransform, CPUTransform) { phi::DenseTensor in; phi::DenseTensor out; - int32_t* ptr = in.mutable_data(phi::make_ddim({2, 3}), place); + int32_t* ptr = in.mutable_data(common::make_ddim({2, 3}), place); int data_number = 2 * 3; for (int i = 0; i < data_number; ++i) { @@ -339,7 +341,7 @@ TEST(DataTypeTransform, CPUTransform) { // transform float to int32 float* in_data_float = - in.mutable_data(phi::make_ddim({2, 3}), place); + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_float[i] = static_cast(i); } @@ -352,7 +354,7 
@@ TEST(DataTypeTransform, CPUTransform) { // transform double to int32 double* in_data_double = - in.mutable_data(phi::make_ddim({2, 3}), place); + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_double[i] = i; } @@ -365,7 +367,7 @@ TEST(DataTypeTransform, CPUTransform) { // transform bfloat16 to int32 paddle::platform::bfloat16* in_data_bf16 = - in.mutable_data(phi::make_ddim({2, 3}), + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_bf16[i] = i; @@ -379,7 +381,7 @@ TEST(DataTypeTransform, CPUTransform) { // transform int64 to int32 int64_t* in_data_int64 = - in.mutable_data(phi::make_ddim({2, 3}), place); + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_int64[i] = i; } @@ -391,7 +393,8 @@ TEST(DataTypeTransform, CPUTransform) { } // transform bool to int32 - bool* in_data_bool = in.mutable_data(phi::make_ddim({2, 3}), place); + bool* in_data_bool = + in.mutable_data(common::make_ddim({2, 3}), place); for (int i = 0; i < data_number; ++i) { in_data_bool[i] = i; } diff --git a/test/cpp/fluid/framework/data_type_transform_test.cu b/test/cpp/fluid/framework/data_type_transform_test.cu index f9394bea7fc372..e854408a071721 100644 --- a/test/cpp/fluid/framework/data_type_transform_test.cu +++ b/test/cpp/fluid/framework/data_type_transform_test.cu @@ -50,7 +50,8 @@ TEST(DataTypeTransform, GPUTransform) { phi::DenseTensor out_gpu; phi::DenseTensor out; - float* in_ptr = in.mutable_data(phi::make_ddim({2, 3}), cpu_place); + float* in_ptr = + in.mutable_data(common::make_ddim({2, 3}), cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; int data_number = sizeof(arr) / sizeof(arr[0]); memcpy(in_ptr, arr, sizeof(arr)); @@ -86,7 +87,7 @@ TEST(DataTypeTransform, GPUTransform) { phi::DenseTensor out; paddle::platform::float16* ptr = in.mutable_data( - phi::make_ddim({2, 3}), cpu_place); + common::make_ddim({2, 3}), cpu_place); paddle::platform::float16 arr[6] = {paddle::platform::float16(0), paddle::platform::float16(1), paddle::platform::float16(2), @@ -152,7 +153,7 @@ TEST(DataTypeTransform, GPUTransform) { // transform float to float16 float* in_data_float = - in.mutable_data(phi::make_ddim({2, 3}), cpu_place); + in.mutable_data(common::make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_float[i] = i; } @@ -172,7 +173,7 @@ TEST(DataTypeTransform, GPUTransform) { // transform double to float16 double* in_data_double = - in.mutable_data(phi::make_ddim({2, 3}), cpu_place); + in.mutable_data(common::make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_double[i] = i; } @@ -191,7 +192,8 @@ TEST(DataTypeTransform, GPUTransform) { } // transform int to float16 - int* in_data_int = in.mutable_data(phi::make_ddim({2, 3}), cpu_place); + int* in_data_int = + in.mutable_data(common::make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_int[i] = i; } @@ -211,7 +213,7 @@ TEST(DataTypeTransform, GPUTransform) { // transform int64 to float16 int64_t* in_data_int64 = - in.mutable_data(phi::make_ddim({2, 3}), cpu_place); + in.mutable_data(common::make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { in_data_int64[i] = i; } @@ -231,7 +233,7 @@ TEST(DataTypeTransform, GPUTransform) { // transform bool to float16 bool* in_data_bool = - in.mutable_data(phi::make_ddim({2, 3}), cpu_place); + in.mutable_data(common::make_ddim({2, 3}), cpu_place); for (int i = 0; i < data_number; ++i) { 
in_data_bool[i] = i; } diff --git a/test/cpp/fluid/framework/eigen_test.cc b/test/cpp/fluid/framework/eigen_test.cc index 4771922986b62e..2307cca56f152e 100644 --- a/test/cpp/fluid/framework/eigen_test.cc +++ b/test/cpp/fluid/framework/eigen_test.cc @@ -17,13 +17,13 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace paddle { namespace framework { TEST(EigenDim, From) { - EigenDim<3>::Type ed = EigenDim<3>::From(phi::make_ddim({1, 2, 3})); + EigenDim<3>::Type ed = EigenDim<3>::From(common::make_ddim({1, 2, 3})); ASSERT_EQ(1, ed[0]); ASSERT_EQ(2, ed[1]); ASSERT_EQ(3, ed[2]); @@ -32,7 +32,7 @@ TEST(EigenDim, From) { TEST(Eigen, DenseTensor) { phi::DenseTensor t; float* p = - t.mutable_data(phi::make_ddim({1, 2, 3}), platform::CPUPlace()); + t.mutable_data(common::make_ddim({1, 2, 3}), platform::CPUPlace()); for (int i = 0; i < 1 * 2 * 3; i++) { p[i] = static_cast(i); } @@ -54,7 +54,7 @@ TEST(Eigen, DenseTensor) { TEST(Eigen, ScalarFrom) { phi::DenseTensor t; - int* p = t.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); + int* p = t.mutable_data(common::make_ddim({1}), platform::CPUPlace()); *p = static_cast(100); EigenScalar::Type es = EigenScalar::From(t); @@ -65,7 +65,8 @@ TEST(Eigen, ScalarFrom) { TEST(Eigen, VectorFrom) { phi::DenseTensor t; - float* p = t.mutable_data(phi::make_ddim({6}), platform::CPUPlace()); + float* p = + t.mutable_data(common::make_ddim({6}), platform::CPUPlace()); for (int i = 0; i < 6; i++) { p[i] = static_cast(i); } @@ -82,7 +83,7 @@ TEST(Eigen, VectorFrom) { TEST(Eigen, VectorFlatten) { phi::DenseTensor t; float* p = - t.mutable_data(phi::make_ddim({1, 2, 3}), platform::CPUPlace()); + t.mutable_data(common::make_ddim({1, 2, 3}), platform::CPUPlace()); for (int i = 0; i < 1 * 2 * 3; i++) { p[i] = static_cast(i); } @@ -99,7 +100,7 @@ TEST(Eigen, VectorFlatten) { TEST(Eigen, Matrix) { phi::DenseTensor t; float* p = - t.mutable_data(phi::make_ddim({2, 3}), platform::CPUPlace()); + t.mutable_data(common::make_ddim({2, 3}), platform::CPUPlace()); for (int i = 0; i < 2 * 3; i++) { p[i] = static_cast(i); } diff --git a/test/cpp/fluid/framework/operator_test.cc b/test/cpp/fluid/framework/operator_test.cc index baca5b3f06743a..d40a45ae5172a3 100644 --- a/test/cpp/fluid/framework/operator_test.cc +++ b/test/cpp/fluid/framework/operator_test.cc @@ -549,7 +549,7 @@ void SetGetLoDLevelTestMain(std::string op_type) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto* x_var = scope.Var("x.0"); auto* x = x_var->GetMutable(); - x->mutable_data(phi::make_ddim({64}), place); + x->mutable_data(common::make_ddim({64}), place); auto* out_var = scope.Var("out.0"); out_var->GetMutable(); diff --git a/test/cpp/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/test/cpp/fluid/framework/paddle2cinn/cinn_cache_key_test.cc index cd2da68a7f6ddf..f9406840dd2d86 100644 --- a/test/cpp/fluid/framework/paddle2cinn/cinn_cache_key_test.cc +++ b/test/cpp/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -18,10 +18,10 @@ #include #include "gtest/gtest.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/phi/core/ddim.h" namespace paddle { namespace framework { @@ -47,7 +47,7 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKeyByStructure) { std::map feed_tensors = { {"X", tensor_pointer}}; - DDim ddim = phi::make_ddim({1, 2, 3}); 
+ DDim ddim = common::make_ddim({1, 2, 3}); std::map feed_shapes = {{"X", ddim}}; std::map feed_dtypes = {{"X", fp32}}; @@ -125,7 +125,7 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKeyByAddress) { std::map feed_tensors = { {"X", tensor_pointer}}; - DDim ddim = phi::make_ddim({1, 2, 3}); + DDim ddim = common::make_ddim({1, 2, 3}); std::map feed_shapes = {{"X", ddim}}; std::map feed_dtypes = {{"X", fp32}}; std::map new_dtypes = {{"X", DataType::FLOAT64}}; diff --git a/test/cpp/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/test/cpp/fluid/framework/paddle2cinn/cinn_compiler_test.cc index 519b78115748be..63c05d19a738e3 100644 --- a/test/cpp/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/test/cpp/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -26,6 +26,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/cinn/common/target.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -35,7 +36,6 @@ #include "paddle/fluid/operators/cinn/cinn_launch_op.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/flags.h" #include "paddle/utils/flags.h" @@ -258,7 +258,7 @@ TEST(CinnCompilerTest, Compile) { std::unordered_map create_inputs; for (const auto& pair : inputs_info) { auto& tensor = create_inputs[pair.first]; - tensor.Resize(phi::make_ddim(pair.second)); + tensor.Resize(common::make_ddim(pair.second)); tensor.mutable_data(platform::CPUPlace()); } std::map input_tensors; diff --git a/test/cpp/fluid/framework/reader_test.cc b/test/cpp/fluid/framework/reader_test.cc index bca4f7de8ad0a0..cb53bdcf080923 100644 --- a/test/cpp/fluid/framework/reader_test.cc +++ b/test/cpp/fluid/framework/reader_test.cc @@ -40,7 +40,7 @@ class StubRootReader : public paddle::framework::ReaderBase { TEST(READER, decorate_chain) { paddle::framework::proto::VarType::Type dtype = paddle::framework::proto::VarType::FP32; - paddle::framework::DDim dim = phi::make_ddim({5, 7}); + paddle::framework::DDim dim = common::make_ddim({5, 7}); std::vector init_dims(4, dim); std::vector init_types(4, dtype); std::vector init_need_check(4, true); diff --git a/test/cpp/fluid/framework/selected_rows_utils_test.cc b/test/cpp/fluid/framework/selected_rows_utils_test.cc index 15735b87d0f9d5..6af07e03432fe8 100644 --- a/test/cpp/fluid/framework/selected_rows_utils_test.cc +++ b/test/cpp/fluid/framework/selected_rows_utils_test.cc @@ -30,7 +30,8 @@ class SelectedRowsTester : public ::testing::Test { phi::DenseTensor* value = selected_rows_->mutable_value(); auto* data = value->mutable_data( - phi::make_ddim({static_cast(rows.size()), row_numel}), place_); + common::make_ddim({static_cast(rows.size()), row_numel}), + place_); for (int64_t i = 0; i < value->numel(); ++i) { data[i] = static_cast(i); } @@ -44,11 +45,11 @@ class SelectedRowsTester : public ::testing::Test { TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } TEST_F(SelectedRowsTester, dims) { - ASSERT_EQ(selected_rows_->value().dims(), phi::make_ddim({3, 100})); + ASSERT_EQ(selected_rows_->value().dims(), common::make_ddim({3, 100})); } TEST_F(SelectedRowsTester, complete_dims) { - ASSERT_EQ(selected_rows_->GetCompleteDims(), phi::make_ddim({10, 100})); + ASSERT_EQ(selected_rows_->GetCompleteDims(), common::make_ddim({10, 100})); } TEST_F(SelectedRowsTester, SerializeAndDeseralize) { @@ -78,7 +79,8 @@ 
TEST(SelectedRows, SparseTable) { int64_t table_size = 100; int64_t embedding_width = 8; // initialize a sparse table - table.mutable_value()->Resize(phi::make_ddim({table_size, embedding_width})); + table.mutable_value()->Resize( + common::make_ddim({table_size, embedding_width})); auto* data = table.mutable_value()->mutable_data(cpu); for (int64_t i = 0; i < table_size; ++i) { for (int64_t j = 0; j < embedding_width; ++j) { @@ -99,7 +101,7 @@ TEST(SelectedRows, SparseTable) { ASSERT_EQ(table.rows().size(), 3UL); phi::DenseTensor ids; - ids.Resize(phi::make_ddim({4})); + ids.Resize(common::make_ddim({4})); auto* ids_data = ids.mutable_data(cpu); ids_data[0] = static_cast(6); ids_data[1] = static_cast(6); @@ -107,8 +109,8 @@ TEST(SelectedRows, SparseTable) { ids_data[3] = static_cast(10); phi::DenseTensor get_value; - auto* value_data = - get_value.mutable_data(phi::make_ddim({4, embedding_width}), cpu); + auto* value_data = get_value.mutable_data( + common::make_ddim({4, embedding_width}), cpu); table.Get(ids, &get_value); for (int j = 0; j < embedding_width; ++j) { @@ -176,7 +178,8 @@ TEST(SelectedRows, MultiThreadAutoIndex) { int64_t table_size = 100000; int64_t embedding_width = 8; // initialize a sparse table - table.mutable_value()->Resize(phi::make_ddim({table_size, embedding_width})); + table.mutable_value()->Resize( + common::make_ddim({table_size, embedding_width})); auto* data = table.mutable_value()->mutable_data(cpu); for (int64_t i = 0; i < table_size; ++i) { for (int64_t j = 0; j < embedding_width; ++j) { diff --git a/test/cpp/fluid/framework/tensor_test.cc b/test/cpp/fluid/framework/tensor_test.cc index 5ef6f53d38d509..be6da7c50453f3 100644 --- a/test/cpp/fluid/framework/tensor_test.cc +++ b/test/cpp/fluid/framework/tensor_test.cc @@ -54,26 +54,26 @@ TEST(DenseTensor, MutableData) { float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = src_tensor.mutable_data(phi::make_ddim({1, 2, 3}), + p1 = src_tensor.mutable_data(common::make_ddim({1, 2, 3}), platform::CPUPlace()); auto p1_holder = src_tensor.Holder(); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(phi::make_ddim({3, 4}), + p2 = src_tensor.mutable_data(common::make_ddim({3, 4}), platform::CPUPlace()); EXPECT_NE(p2, nullptr); auto p2_holder1 = src_tensor.Holder(); EXPECT_NE(p1_holder.get(), p2_holder1.get()); // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(phi::make_ddim({2, 2, 3}), + p1 = src_tensor.mutable_data(common::make_ddim({2, 2, 3}), platform::CPUPlace()); auto p2_holder2 = src_tensor.Holder(); EXPECT_EQ(p2_holder1.get(), p2_holder2.get()); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(phi::make_ddim({2, 2}), + p2 = src_tensor.mutable_data(common::make_ddim({2, 2}), platform::CPUPlace()); auto p2_holder3 = src_tensor.Holder(); EXPECT_EQ(p1, p2); @@ -83,7 +83,7 @@ TEST(DenseTensor, MutableData) { float* p4 = nullptr; // set src_tensor a different type but smaller size. // memory block is supposed to be unchanged. - auto* tmp = src_tensor.mutable_data(phi::make_ddim({2, 2}), + auto* tmp = src_tensor.mutable_data(common::make_ddim({2, 2}), platform::CPUPlace()); p3 = reinterpret_cast(tmp); auto p3_holder1 = src_tensor.Holder(); @@ -92,7 +92,7 @@ TEST(DenseTensor, MutableData) { // set src_tensor a different type but bigger size. 
// memory block is supposed to be changed. - auto* tmp2 = src_tensor.mutable_data(phi::make_ddim({2, 2, 3}), + auto* tmp2 = src_tensor.mutable_data(common::make_ddim({2, 2, 3}), platform::CPUPlace()); auto p3_holder2 = src_tensor.Holder(); p4 = reinterpret_cast(tmp2); @@ -103,12 +103,12 @@ TEST(DenseTensor, MutableData) { // changed. { phi::DenseTensor src_tensor; - int8_t* p1 = src_tensor.mutable_data(phi::make_ddim({1}), + int8_t* p1 = src_tensor.mutable_data(common::make_ddim({1}), platform::CPUPlace()); EXPECT_NE(p1, nullptr); *p1 = 1; - uint8_t* p2 = src_tensor.mutable_data(phi::make_ddim({1}), + uint8_t* p2 = src_tensor.mutable_data(common::make_ddim({1}), platform::CPUPlace()); EXPECT_NE(p2, nullptr); EXPECT_EQ(static_cast(p2[0]), 1); @@ -120,25 +120,25 @@ TEST(DenseTensor, MutableData) { float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = src_tensor.mutable_data(phi::make_ddim({1, 2, 3}), + p1 = src_tensor.mutable_data(common::make_ddim({1, 2, 3}), platform::CUDAPlace(0)); auto p1_holder = src_tensor.Holder(); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(phi::make_ddim({3, 1024}), + p2 = src_tensor.mutable_data(common::make_ddim({3, 1024}), platform::CUDAPlace(0)); auto p2_holder = src_tensor.Holder(); EXPECT_NE(p2, nullptr); EXPECT_NE(p1_holder.get(), p2_holder.get()); // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(phi::make_ddim({2, 2, 3}), + p1 = src_tensor.mutable_data(common::make_ddim({2, 2, 3}), platform::CUDAPlace(0)); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(phi::make_ddim({2, 2}), + p2 = src_tensor.mutable_data(common::make_ddim({2, 2}), platform::CUDAPlace(0)); EXPECT_EQ(p1, p2); } @@ -162,7 +162,7 @@ TEST(DenseTensor, ShareDataWith) { } ASSERT_TRUE(caught); - src_tensor.mutable_data(phi::make_ddim({2, 3, 4}), + src_tensor.mutable_data(common::make_ddim({2, 3, 4}), platform::CPUPlace()); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); @@ -172,7 +172,7 @@ TEST(DenseTensor, ShareDataWith) { { phi::DenseTensor src_tensor; phi::DenseTensor dst_tensor; - src_tensor.mutable_data(phi::make_ddim({2, 3, 4}), + src_tensor.mutable_data(common::make_ddim({2, 3, 4}), platform::CUDAPlace(0)); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); @@ -183,7 +183,7 @@ TEST(DenseTensor, ShareDataWith) { TEST(DenseTensor, Slice) { { phi::DenseTensor src_tensor; - src_tensor.mutable_data(phi::make_ddim({5, 3, 4}), + src_tensor.mutable_data(common::make_ddim({5, 3, 4}), platform::CPUPlace()); phi::DenseTensor slice_tensor = src_tensor.Slice(1, 3); phi::DDim slice_dims = slice_tensor.dims(); @@ -209,7 +209,7 @@ TEST(DenseTensor, Slice) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { phi::DenseTensor src_tensor; - src_tensor.mutable_data(phi::make_ddim({6, 9}), + src_tensor.mutable_data(common::make_ddim({6, 9}), platform::CUDAPlace(0)); phi::DenseTensor slice_tensor = src_tensor.Slice(2, 6); phi::DDim slice_dims = slice_tensor.dims(); @@ -270,7 +270,8 @@ TEST(DenseTensor, FP16) { TEST(DenseTensor, Split) { { phi::DenseTensor src_tensor; - src_tensor.mutable_data(phi::make_ddim({6, 2}), platform::CPUPlace()); + src_tensor.mutable_data(common::make_ddim({6, 2}), + platform::CPUPlace()); std::vector split_tensor_list = 
src_tensor.Split(2, 0); ASSERT_EQ(split_tensor_list.size(), 3UL); EXPECT_EQ(split_tensor_list[0].dims()[0], 2); @@ -298,7 +299,7 @@ TEST(DenseTensor, Split) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { phi::DenseTensor src_tensor; - src_tensor.mutable_data(phi::make_ddim({6, 4}), + src_tensor.mutable_data(common::make_ddim({6, 4}), platform::CUDAPlace(0)); std::vector split_tensor_list = src_tensor.Split(2, 0); ASSERT_EQ(split_tensor_list.size(), 3UL); @@ -332,7 +333,8 @@ TEST(DenseTensor, Split) { TEST(DenseTensor, Chunk) { { phi::DenseTensor src_tensor; - src_tensor.mutable_data(phi::make_ddim({6, 2}), platform::CPUPlace()); + src_tensor.mutable_data(common::make_ddim({6, 2}), + platform::CPUPlace()); std::vector split_tensor_list = src_tensor.Chunk(3, 0); ASSERT_EQ(split_tensor_list.size(), 3UL); EXPECT_EQ(split_tensor_list[0].dims()[0], 2); @@ -360,7 +362,7 @@ TEST(DenseTensor, Chunk) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { phi::DenseTensor src_tensor; - src_tensor.mutable_data(phi::make_ddim({6, 4}), + src_tensor.mutable_data(common::make_ddim({6, 4}), platform::CUDAPlace(0)); std::vector split_tensor_list = src_tensor.Chunk(3, 0); ASSERT_EQ(split_tensor_list.size(), 3UL); diff --git a/test/cpp/fluid/framework/tensor_util_test.cc b/test/cpp/fluid/framework/tensor_util_test.cc index 65c2bf3b64b4a9..6b9c25750ac070 100644 --- a/test/cpp/fluid/framework/tensor_util_test.cc +++ b/test/cpp/fluid/framework/tensor_util_test.cc @@ -26,7 +26,7 @@ TEST(TensorCopy, Tensor) { phi::DenseTensor dst_tensor; phi::CPUContext cpu_ctx((platform::CPUPlace())); - int* src_ptr = src_tensor.mutable_data(phi::make_ddim({3, 3}), + int* src_ptr = src_tensor.mutable_data(common::make_ddim({3, 3}), platform::CPUPlace()); std::array arr = {1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -65,7 +65,7 @@ TEST(TensorCopy, Tensor) { phi::DenseTensor gpu_tensor; phi::DenseTensor dst_tensor; - int* src_ptr = src_tensor.mutable_data(phi::make_ddim({3, 3}), + int* src_ptr = src_tensor.mutable_data(common::make_ddim({3, 3}), platform::CPUPlace()); int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -129,7 +129,7 @@ TEST(TensorFromVector, Tensor) { phi::DenseTensor cpu_tensor; // Copy to CPU phi::DenseTensor - cpu_tensor.Resize(phi::make_ddim({3, 3})); + cpu_tensor.Resize(common::make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); paddle::framework::TensorFromVector(src_vec, &cpu_tensor); @@ -142,7 +142,7 @@ TEST(TensorFromVector, Tensor) { } src_vec.erase(src_vec.begin(), src_vec.begin() + 5); - cpu_tensor.Resize(phi::make_ddim({2, 2})); + cpu_tensor.Resize(common::make_ddim({2, 2})); paddle::framework::TensorFromVector(src_vec, &cpu_tensor); cpu_ptr = cpu_tensor.data(); src_ptr = src_vec.data(); @@ -162,13 +162,13 @@ TEST(TensorFromVector, Tensor) { phi::DenseTensor dst_tensor; // Copy to CPU phi::DenseTensor - cpu_tensor.Resize(phi::make_ddim({3, 3})); + cpu_tensor.Resize(common::make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); phi::CPUContext cpu_ctx(*cpu_place); paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); // Copy to GPUTensor - gpu_tensor.Resize(phi::make_ddim({3, 3})); + gpu_tensor.Resize(common::make_ddim({3, 3})); auto gpu_place = new paddle::platform::CUDAPlace(); phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() @@ -193,9 +193,9 @@ TEST(TensorFromVector, Tensor) { src_vec.erase(src_vec.begin(), src_vec.begin() + 5); - cpu_tensor.Resize(phi::make_ddim({2, 2})); + 
cpu_tensor.Resize(common::make_ddim({2, 2})); paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); - gpu_tensor.Resize(phi::make_ddim({2, 2})); + gpu_tensor.Resize(common::make_ddim({2, 2})); paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); @@ -307,7 +307,7 @@ TEST(TensorFromDLPack, Tensor) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; phi::DenseTensor cpu_tensor; - cpu_tensor.Resize(phi::make_ddim({3, 3})); + cpu_tensor.Resize(common::make_ddim({3, 3})); paddle::platform::CPUPlace cpu_place; phi::CPUContext cpu_ctx(cpu_place); paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); @@ -333,13 +333,13 @@ TEST(TensorFromDLPack, Tensor) { phi::DenseTensor gpu_tensor_from_dlpack; // Copy to CPU phi::DenseTensor - cpu_tensor.Resize(phi::make_ddim({3, 3})); + cpu_tensor.Resize(common::make_ddim({3, 3})); paddle::platform::CPUPlace cpu_place; phi::CPUContext cpu_ctx(cpu_place); paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); // Copy to GPUTensor - gpu_tensor.Resize(phi::make_ddim({3, 3})); + gpu_tensor.Resize(common::make_ddim({3, 3})); paddle::platform::CUDAPlace gpu_place; auto& gpu_ctx = *paddle::platform::DeviceContextPool::Instance().GetByPlace(gpu_place); diff --git a/test/cpp/fluid/fused/CMakeLists.txt b/test/cpp/fluid/fused/CMakeLists.txt index 59bf35e05c021a..b27dbaae673678 100644 --- a/test/cpp/fluid/fused/CMakeLists.txt +++ b/test/cpp/fluid/fused/CMakeLists.txt @@ -16,6 +16,7 @@ if(WITH_GPU OR WITH_ROCM) generated_op device_context phi + common memory) nv_test( test_fused_dropout_act_bias @@ -26,6 +27,7 @@ if(WITH_GPU OR WITH_ROCM) generated_op device_context phi + common memory) nv_test( test_fused_layernorm_residual_dropout_bias @@ -37,6 +39,7 @@ if(WITH_GPU OR WITH_ROCM) generated_op device_context phi + common memory ${CINN_DEPS}) endif() @@ -51,6 +54,7 @@ if(WITH_GPU OR WITH_ROCM) op_registry device_context phi + common memory) cc_test( test_cudnn_bn_add_relu @@ -61,6 +65,7 @@ if(WITH_GPU OR WITH_ROCM) op_registry device_context phi + common memory) endif() endif() diff --git a/test/cpp/fluid/fused/cudnn_bn_add_relu_test.cc b/test/cpp/fluid/fused/cudnn_bn_add_relu_test.cc index ee220f993bfaa2..ae4697833d7584 100644 --- a/test/cpp/fluid/fused/cudnn_bn_add_relu_test.cc +++ b/test/cpp/fluid/fused/cudnn_bn_add_relu_test.cc @@ -44,7 +44,7 @@ template void InitRandomTensor(const std::vector &dims, phi::DenseTensor *cpu_out) { T *cpu_out_ptr = - cpu_out->mutable_data(phi::make_ddim(dims), platform::CPUPlace()); + cpu_out->mutable_data(common::make_ddim(dims), platform::CPUPlace()); std::default_random_engine random(0); std::uniform_real_distribution dis(-1.0, 1.0); for (int i = 0; i < cpu_out->numel(); ++i) { @@ -57,7 +57,7 @@ void InitConstantTensor(const std::vector &dims, T value, phi::DenseTensor *cpu_out) { T *cpu_out_ptr = - cpu_out->mutable_data(phi::make_ddim(dims), platform::CPUPlace()); + cpu_out->mutable_data(common::make_ddim(dims), platform::CPUPlace()); for (int i = 0; i < cpu_out->numel(); ++i) { cpu_out_ptr[i] = value; } @@ -652,7 +652,7 @@ class CudnnBNAddReluTester { saved_mean->Resize({1, 1, 1, channels_}); saved_var->Resize({1, 1, 1, channels_}); - auto param_shape = phi::vectorize(bn_scale->dims()); + auto param_shape = common::vectorize(bn_scale->dims()); op::CudnnBNStatsFinalize bn_op(ctx, param_shape); bn_op.Forward(ctx, *sum, @@ -759,17 +759,17 @@ class CudnnBNAddReluTester { &equiv_bias_z); } - 
y.Resize(phi::make_ddim({batch_size_, height_, width_, channels_})); + y.Resize(common::make_ddim({batch_size_, height_, width_, channels_})); int c = channels_; int64_t nhw = ele_count_; int32_t c_int32_elems = ((c + 63) & ~63) / 32; int32_t nhw_int32_elems = (nhw + 31) & ~31; - bitmask.Resize(phi::make_ddim({nhw_int32_elems, c_int32_elems, 1})); + bitmask.Resize(common::make_ddim({nhw_int32_elems, c_int32_elems, 1})); - auto data_shape = phi::vectorize(x.dims()); - auto param_shape = phi::vectorize(bn_scale_x.dims()); - auto bitmask_shape = phi::vectorize(bitmask.dims()); + auto data_shape = common::vectorize(x.dims()); + auto param_shape = common::vectorize(bn_scale_x.dims()); + auto bitmask_shape = common::vectorize(bitmask.dims()); // 2. Scale Bias + Relu op::CudnnScaleBiasAddRelu sbar_op(ctx, @@ -841,14 +841,14 @@ class CudnnBNAddReluTester { saved_mean.Resize({1, 1, 1, channels_}); saved_var.Resize({1, 1, 1, channels_}); - dx.Resize(phi::make_ddim({batch_size_, height_, width_, channels_})); - dz.Resize(phi::make_ddim({batch_size_, height_, width_, channels_})); - dscale.Resize(phi::make_ddim({1, 1, 1, channels_})); - dbias.Resize(phi::make_ddim({1, 1, 1, channels_})); + dx.Resize(common::make_ddim({batch_size_, height_, width_, channels_})); + dz.Resize(common::make_ddim({batch_size_, height_, width_, channels_})); + dscale.Resize(common::make_ddim({1, 1, 1, channels_})); + dbias.Resize(common::make_ddim({1, 1, 1, channels_})); - auto data_shape = phi::vectorize(x.dims()); - auto param_shape = phi::vectorize(bn_scale.dims()); - auto bitmask_shape = phi::vectorize(bitmask.dims()); + auto data_shape = common::vectorize(x.dims()); + auto param_shape = common::vectorize(bn_scale.dims()); + auto bitmask_shape = common::vectorize(bitmask.dims()); std::string act_type = "relu"; op::CudnnScaleBiasAddRelu sbar_op( diff --git a/test/cpp/fluid/fused/cudnn_norm_conv_test.cc b/test/cpp/fluid/fused/cudnn_norm_conv_test.cc index 16ea8f5ade0842..97d76aa65b0073 100644 --- a/test/cpp/fluid/fused/cudnn_norm_conv_test.cc +++ b/test/cpp/fluid/fused/cudnn_norm_conv_test.cc @@ -38,7 +38,7 @@ template void InitRandomTensor(const std::vector &dims, phi::DenseTensor *cpu_out) { T *cpu_out_ptr = - cpu_out->mutable_data(phi::make_ddim(dims), platform::CPUPlace()); + cpu_out->mutable_data(common::make_ddim(dims), platform::CPUPlace()); std::default_random_engine random(0); std::uniform_real_distribution dis(0.0, 1.0); @@ -335,14 +335,14 @@ class CudnnNormConvolutionTester { paddle::framework::TensorCopySync(cpu_input_, place, &input); paddle::framework::TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); - output.Resize(phi::make_ddim( + output.Resize(common::make_ddim( {batch_size_, out_height_, out_width_, output_channels_})); - sum.Resize(phi::make_ddim({1, 1, 1, output_channels_})); - sum_of_square.Resize(phi::make_ddim({1, 1, 1, output_channels_})); + sum.Resize(common::make_ddim({1, 1, 1, output_channels_})); + sum_of_square.Resize(common::make_ddim({1, 1, 1, output_channels_})); - auto input_shape = phi::vectorize(input.dims()); - auto filter_shape = phi::vectorize(filter_nhwc.dims()); - auto output_shape = phi::vectorize(output.dims()); + auto input_shape = common::vectorize(input.dims()); + auto filter_shape = common::vectorize(filter_nhwc.dims()); + auto output_shape = common::vectorize(output.dims()); op::CudnnNormConvolution conv_op(ctx, input_shape, filter_shape, @@ -376,9 +376,9 @@ class CudnnNormConvolutionTester { input_grad.Resize(input.dims()); filter_grad.Resize(filter_nhwc.dims()); - 
auto input_shape = phi::vectorize(input.dims()); - auto filter_shape = phi::vectorize(filter_nhwc.dims()); - auto output_shape = phi::vectorize(output_grad.dims()); + auto input_shape = common::vectorize(input.dims()); + auto filter_shape = common::vectorize(filter_nhwc.dims()); + auto output_shape = common::vectorize(output_grad.dims()); op::CudnnNormConvolutionGrad conv_grad_op(ctx, input_shape, filter_shape, diff --git a/test/cpp/fluid/fused/fusion_group_op_test.cc b/test/cpp/fluid/fused/fusion_group_op_test.cc index 19d7d48ae0fa99..80aff6543e55bf 100644 --- a/test/cpp/fluid/fused/fusion_group_op_test.cc +++ b/test/cpp/fluid/fused/fusion_group_op_test.cc @@ -33,7 +33,7 @@ phi::DenseTensor* CreateTensor(framework::Scope* scope, auto* var = scope->Var(name); auto* tensor = var->GetMutable(); if (!shape.empty()) { - tensor->mutable_data(phi::make_ddim(shape), place); + tensor->mutable_data(common::make_ddim(shape), place); } return tensor; } @@ -45,7 +45,8 @@ void SetupRandomCPUTensor(phi::DenseTensor* tensor, std::mt19937 rng(seed++); std::uniform_real_distribution uniform_dist(0, 1); - T* ptr = tensor->mutable_data(phi::make_ddim(shape), platform::CPUPlace()); + T* ptr = + tensor->mutable_data(common::make_ddim(shape), platform::CPUPlace()); for (int64_t i = 0; i < tensor->numel(); ++i) { ptr[i] = static_cast(uniform_dist(rng)) - static_cast(0.5); } diff --git a/test/cpp/fluid/gather_test.cc b/test/cpp/fluid/gather_test.cc index 9a09d747a55658..358334cc9d3271 100644 --- a/test/cpp/fluid/gather_test.cc +++ b/test/cpp/fluid/gather_test.cc @@ -26,16 +26,16 @@ TEST(Gather, GatherData) { int* p_src = nullptr; int* p_index = nullptr; - p_src = src->mutable_data(phi::make_ddim({3, 4}), + p_src = src->mutable_data(common::make_ddim({3, 4}), paddle::platform::CPUPlace()); - p_index = index->mutable_data(phi::make_ddim({2}), + p_index = index->mutable_data(common::make_ddim({2}), paddle::platform::CPUPlace()); for (int i = 0; i < 12; ++i) p_src[i] = i; p_index[0] = 1; p_index[1] = 0; - int* p_output = output->mutable_data(phi::make_ddim({2, 4}), + int* p_output = output->mutable_data(common::make_ddim({2, 4}), paddle::platform::CPUPlace()); auto* cpu_place = new paddle::platform::CPUPlace(); diff --git a/test/cpp/fluid/lite/CMakeLists.txt b/test/cpp/fluid/lite/CMakeLists.txt index 6533073258ff5b..325b59582a0994 100644 --- a/test/cpp/fluid/lite/CMakeLists.txt +++ b/test/cpp/fluid/lite/CMakeLists.txt @@ -1,4 +1,4 @@ -paddle_test(test_lite_engine_op SRCS lite_engine_op_test.cc) +paddle_test(test_lite_engine_op SRCS lite_engine_op_test.cc DEPS common) if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will diff --git a/test/cpp/fluid/math/CMakeLists.txt b/test/cpp/fluid/math/CMakeLists.txt index 1edc2f25e68341..46d70de2687997 100644 --- a/test/cpp/fluid/math/CMakeLists.txt +++ b/test/cpp/fluid/math/CMakeLists.txt @@ -1,15 +1,15 @@ cc_test( selected_rows_functor_test SRCS selected_rows_functor_test.cc - DEPS allocator phi) + DEPS allocator phi common) cc_test( im2col_test SRCS im2col_test.cc - DEPS phi) + DEPS phi common) cc_test( vol2col_test SRCS vol2col_test.cc - DEPS phi) + DEPS phi common) cc_test( beam_search_test SRCS beam_search_test.cc @@ -18,13 +18,13 @@ if(WITH_GPU) nv_test( selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc - DEPS phi) + DEPS phi common) endif() if(WITH_ROCM) hip_test( selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc - DEPS phi) + DEPS phi common) endif() cc_test( concat_test diff --git 
a/test/cpp/fluid/math/beam_search_test.cc b/test/cpp/fluid/math/beam_search_test.cc index d8e56e6102dd7a..428828aa2cb17e 100644 --- a/test/cpp/fluid/math/beam_search_test.cc +++ b/test/cpp/fluid/math/beam_search_test.cc @@ -32,7 +32,7 @@ void PrepareCPUTensors(phi::DenseTensor* ids, ids->set_lod(lod); scores->set_lod(lod); - auto dims = phi::make_ddim({4, 3}); + auto dims = common::make_ddim({4, 3}); ids->Resize(dims); scores->Resize(dims); @@ -52,13 +52,13 @@ void PrepareCPUTensors(phi::DenseTensor* ids, } // pre_ids - pre_ids->Resize(phi::make_ddim({4, 1})); + pre_ids->Resize(common::make_ddim({4, 1})); for (int i = 0; i < 4; i++) { pre_ids->mutable_data(place)[i] = i + 1; } // pre_scores - pre_scores->Resize(phi::make_ddim({4, 1})); + pre_scores->Resize(common::make_ddim({4, 1})); for (int i = 0; i < 4; i++) { pre_scores->mutable_data(place)[i] = 0.1 * (i + 1); // NOLINT } diff --git a/test/cpp/fluid/math/concat_test.cc b/test/cpp/fluid/math/concat_test.cc index b350167cfb46b8..080a659ecdbbc6 100644 --- a/test/cpp/fluid/math/concat_test.cc +++ b/test/cpp/fluid/math/concat_test.cc @@ -37,9 +37,9 @@ void ConcatCase1(DeviceContext* context) { phi::DenseTensor input_b; phi::DenseTensor out; - auto dim_a = phi::make_ddim({2, 3, 4}); - auto dim_b = phi::make_ddim({3, 3, 4}); - auto dim_out = phi::make_ddim({5, 3, 4}); + auto dim_a = common::make_ddim({2, 3, 4}); + auto dim_b = common::make_ddim({3, 3, 4}); + auto dim_out = common::make_ddim({5, 3, 4}); input_a.mutable_data(dim_a, Place()); input_b.mutable_data(dim_b, Place()); @@ -142,9 +142,9 @@ void ConcatCase2(DeviceContext* context) { phi::DenseTensor input_b; phi::DenseTensor out; - auto dim_a = phi::make_ddim({2, 3, 4}); - auto dim_b = phi::make_ddim({2, 4, 4}); - auto dim_out = phi::make_ddim({2, 7, 4}); + auto dim_a = common::make_ddim({2, 3, 4}); + auto dim_b = common::make_ddim({2, 4, 4}); + auto dim_out = common::make_ddim({2, 7, 4}); input_a.mutable_data(dim_a, Place()); input_b.mutable_data(dim_b, Place()); @@ -251,9 +251,9 @@ void ConcatCase3(DeviceContext* context) { phi::DenseTensor input_b; phi::DenseTensor out; - auto dim_a = phi::make_ddim({2, 3, 4}); - auto dim_b = phi::make_ddim({2, 3, 5}); - auto dim_out = phi::make_ddim({2, 3, 9}); + auto dim_a = common::make_ddim({2, 3, 4}); + auto dim_b = common::make_ddim({2, 3, 5}); + auto dim_out = common::make_ddim({2, 3, 9}); input_a.mutable_data(dim_a, Place()); input_b.mutable_data(dim_b, Place()); @@ -362,9 +362,9 @@ void ConcatCase4(DeviceContext* context) { phi::DenseTensor input_b; phi::DenseTensor out; - auto dim_a = phi::make_ddim({2, 3, 4}); - auto dim_b = phi::make_ddim({2, 3, 4}); - auto dim_out = phi::make_ddim({2, 6, 4}); + auto dim_a = common::make_ddim({2, 3, 4}); + auto dim_b = common::make_ddim({2, 3, 4}); + auto dim_out = common::make_ddim({2, 6, 4}); input_a.mutable_data(dim_a, Place()); input_b.mutable_data(dim_b, Place()); diff --git a/test/cpp/fluid/math/selected_rows_functor_test.cc b/test/cpp/fluid/math/selected_rows_functor_test.cc index a2c88c723fefa6..a32140f4a9c35b 100644 --- a/test/cpp/fluid/math/selected_rows_functor_test.cc +++ b/test/cpp/fluid/math/selected_rows_functor_test.cc @@ -33,7 +33,7 @@ TEST(selected_rows_functor, cpu_add) { new phi::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( - phi::make_ddim({static_cast(rows1.size()), row_numel}), + common::make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); functor(ctx, in1_value, 1.0); @@ -42,7 +42,7 @@ 
TEST(selected_rows_functor, cpu_add) { new phi::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( - phi::make_ddim({static_cast(rows2.size()), row_numel}), + common::make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); functor(ctx, in2_value, 2.0); @@ -50,7 +50,7 @@ TEST(selected_rows_functor, cpu_add) { auto* out_value = output->mutable_value(); // simplely concat two SelectedRows - out_value->mutable_data(phi::make_ddim({7, 10}), cpu_place); + out_value->mutable_data(common::make_ddim({7, 10}), cpu_place); phi::funcs::SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); @@ -84,11 +84,13 @@ TEST(selected_rows_functor, cpu_add) { EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); std::unique_ptr tensor1{new phi::DenseTensor()}; - tensor1->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); + tensor1->mutable_data(common::make_ddim({height, row_numel}), + cpu_place); functor(ctx, tensor1.get(), 3.0); std::unique_ptr tensor2{new phi::DenseTensor()}; - tensor2->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); + tensor2->mutable_data(common::make_ddim({height, row_numel}), + cpu_place); phi::funcs::SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); @@ -125,7 +127,7 @@ TEST(selected_rows_functor, cpu_add_to) { new phi::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( - phi::make_ddim({static_cast(rows1.size()), row_numel}), + common::make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); functor(ctx, in1_value, 1.0); @@ -134,7 +136,7 @@ TEST(selected_rows_functor, cpu_add_to) { new phi::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( - phi::make_ddim({static_cast(rows2.size()), row_numel}), + common::make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); functor(ctx, in2_value, 2.0); @@ -143,7 +145,7 @@ TEST(selected_rows_functor, cpu_add_to) { auto* out_value = output->mutable_value(); // simplely concat two SelectedRows - out_value->mutable_data(phi::make_ddim({7, 10}), cpu_place); + out_value->mutable_data(common::make_ddim({7, 10}), cpu_place); phi::funcs::SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); @@ -178,7 +180,8 @@ TEST(selected_rows_functor, cpu_add_to) { EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); std::unique_ptr tensor1{new phi::DenseTensor()}; - tensor1->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); + tensor1->mutable_data(common::make_ddim({height, row_numel}), + cpu_place); functor(ctx, tensor1.get(), 3.0); phi::funcs::SelectedRowsAddToTensor @@ -217,7 +220,7 @@ TEST(selected_rows_functor, cpu_merge_average_float) { new phi::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( - phi::make_ddim({static_cast(rows.size()), row_numel}), + common::make_ddim({static_cast(rows.size()), row_numel}), cpu_place); functor(ctx, in_value, 1.0); @@ -255,7 +258,7 @@ TEST(selected_rows_functor, cpu_merge_add_float) { new phi::SelectedRows(rows, height)}; auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( - phi::make_ddim({static_cast(rows.size()), row_numel}), + common::make_ddim({static_cast(rows.size()), row_numel}), cpu_place); functor(ctx, in_value, 1.0); @@ -294,7 +297,7 @@ TEST(selected_rows_functor, cpu_merge_add_int) { new phi::SelectedRows(rows, height)}; 
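The SelectedRows hunks above and below repeat one sizing idiom: the value tensor is resized to {rows.size(), row_numel} via common::make_ddim, with rows.size() cast to int64_t. A compact sketch of just that shape computation (the helper name is illustrative, not part of the patch):

#include <cstdint>
#include <vector>

#include "paddle/common/ddim.h"

// One row of `row_numel` values per selected row index.
common::DDim SelectedRowsValueShape(const std::vector<int64_t>& rows,
                                    int64_t row_numel) {
  return common::make_ddim({static_cast<int64_t>(rows.size()), row_numel});
}
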
auto* in_value = selected_rows->mutable_value(); in_value->mutable_data( - phi::make_ddim({static_cast(rows.size()), row_numel}), + common::make_ddim({static_cast(rows.size()), row_numel}), cpu_place); functor(ctx, in_value, 1); @@ -334,7 +337,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { new phi::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( - phi::make_ddim({static_cast(rows1.size()), row_numel}), + common::make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); set_const(ctx, in1_value, 1.0); @@ -343,7 +346,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { new phi::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( - phi::make_ddim({static_cast(rows2.size()), row_numel}), + common::make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); set_const(ctx, in2_value, 1.0); @@ -357,7 +360,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { merge_add_functor(ctx, inputs, output.get()); EXPECT_EQ(output->height(), height); - EXPECT_EQ(output->value().dims(), phi::make_ddim({3, row_numel})); + EXPECT_EQ(output->value().dims(), common::make_ddim({3, row_numel})); std::vector ret_rows{2, 3, 5}; EXPECT_EQ(output->rows(), ret_rows); @@ -386,7 +389,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { new phi::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( - phi::make_ddim({static_cast(rows1.size()), row_numel}), + common::make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); set_const(ctx, in1_value, 1.0); @@ -395,7 +398,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { new phi::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( - phi::make_ddim({static_cast(rows2.size()), row_numel}), + common::make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); set_const(ctx, in2_value, 2.0); @@ -409,7 +412,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { merge_add_functor(ctx, inputs, output.get()); EXPECT_EQ(output->height(), height); - EXPECT_EQ(output->value().dims(), phi::make_ddim({10, row_numel})); + EXPECT_EQ(output->value().dims(), common::make_ddim({10, row_numel})); std::vector ret_rows{1, 3, 5, 7, 9, 0, 2, 4, 6, 8}; EXPECT_EQ(output->rows(), ret_rows); @@ -442,7 +445,7 @@ TEST(selected_rows_functor, cpu_sum_to) { new phi::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( - phi::make_ddim({static_cast(rows1.size()), row_numel}), + common::make_ddim({static_cast(rows1.size()), row_numel}), cpu_place); functor(ctx, in1_value, 1.0); @@ -451,7 +454,7 @@ TEST(selected_rows_functor, cpu_sum_to) { new phi::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( - phi::make_ddim({static_cast(rows2.size()), row_numel}), + common::make_ddim({static_cast(rows2.size()), row_numel}), cpu_place); functor(ctx, in2_value, 2.0); @@ -459,7 +462,7 @@ TEST(selected_rows_functor, cpu_sum_to) { output->set_height(height); auto* out_value = output->mutable_value(); // simplely concat two SelectedRows - out_value->mutable_data(phi::make_ddim({7, 10}), cpu_place); + out_value->mutable_data(common::make_ddim({7, 10}), cpu_place); phi::funcs::SelectedRowsSumTo sum_to_functor; sum_to_functor(ctx, std::vector( @@ -491,7 +494,8 @@ TEST(selected_rows_functor, cpu_sum_to) { EXPECT_EQ(out_data[5 * 
row_numel + 7], 2.0); EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); std::unique_ptr tensor1{new phi::DenseTensor()}; - tensor1->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); + tensor1->mutable_data(common::make_ddim({height, row_numel}), + cpu_place); functor(ctx, tensor1.get(), 3.0); phi::funcs::SelectedRowsAddToTensor add_to_tensor_functor; diff --git a/test/cpp/fluid/math/selected_rows_functor_test.cu.cc b/test/cpp/fluid/math/selected_rows_functor_test.cu.cc index b507f096082f94..a11dbe7c8158f7 100644 --- a/test/cpp/fluid/math/selected_rows_functor_test.cu.cc +++ b/test/cpp/fluid/math/selected_rows_functor_test.cu.cc @@ -15,10 +15,10 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/selected_rows_functor.h" #include "gtest/gtest.h" +#include "paddle/common/errors.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -36,7 +36,7 @@ TEST(selected_rows_functor, gpu_add) { new phi::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( - phi::make_ddim({static_cast(rows1.size()), row_numel}), + common::make_ddim({static_cast(rows1.size()), row_numel}), gpu_place); functor(ctx, in1_value, 1.0); #ifdef PADDLE_WITH_HIP @@ -56,7 +56,7 @@ TEST(selected_rows_functor, gpu_add) { new phi::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( - phi::make_ddim({static_cast(rows2.size()), row_numel}), + common::make_ddim({static_cast(rows2.size()), row_numel}), gpu_place); functor(ctx, in2_value, 2.0); @@ -64,7 +64,7 @@ TEST(selected_rows_functor, gpu_add) { auto* out_value = output->mutable_value(); // simply concat two SelectedRows - out_value->mutable_data(phi::make_ddim({7, 10}), gpu_place); + out_value->mutable_data(common::make_ddim({7, 10}), gpu_place); phi::funcs::SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); @@ -101,11 +101,13 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); std::unique_ptr tensor1{new phi::DenseTensor()}; - tensor1->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); + tensor1->mutable_data(common::make_ddim({height, row_numel}), + gpu_place); functor(ctx, tensor1.get(), 3.0); std::unique_ptr tensor2{new phi::DenseTensor()}; - tensor2->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); + tensor2->mutable_data(common::make_ddim({height, row_numel}), + gpu_place); phi::funcs::SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); @@ -144,7 +146,7 @@ TEST(selected_rows_functor, gpu_add_to) { new phi::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data( - phi::make_ddim({static_cast(rows1.size()), row_numel}), + common::make_ddim({static_cast(rows1.size()), row_numel}), gpu_place); functor(ctx, in1_value, 1.0); @@ -153,7 +155,7 @@ TEST(selected_rows_functor, gpu_add_to) { new phi::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data( - phi::make_ddim({static_cast(rows2.size()), row_numel}), + common::make_ddim({static_cast(rows2.size()), row_numel}), gpu_place); functor(ctx, in2_value, 2.0); @@ -162,7 +164,7 @@ TEST(selected_rows_functor, gpu_add_to) { auto* out_value = 
output->mutable_value(); // simply concat two SelectedRows - out_value->mutable_data<float>(phi::make_ddim({7, 10}), gpu_place); + out_value->mutable_data<float>(common::make_ddim({7, 10}), gpu_place); phi::funcs::SelectedRowsAddTo<phi::GPUContext, float> add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); @@ -200,7 +202,8 @@ TEST(selected_rows_functor, gpu_add_to) { EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); std::unique_ptr<phi::DenseTensor> tensor1{new phi::DenseTensor()}; - tensor1->mutable_data<float>(phi::make_ddim({height, row_numel}), gpu_place); + tensor1->mutable_data<float>(common::make_ddim({height, row_numel}), + gpu_place); functor(ctx, tensor1.get(), 3.0); phi::funcs::SelectedRowsAddToTensor<phi::GPUContext, float> @@ -242,7 +245,7 @@ TEST(selected_rows_functor, gpu_merge_add) { new phi::SelectedRows(rows1, height)}; auto* in1_value = selected_rows1->mutable_value(); in1_value->mutable_data<float>( - phi::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), + common::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place); set_const(ctx, in1_value, 1.0); @@ -251,7 +254,7 @@ TEST(selected_rows_functor, gpu_merge_add) { new phi::SelectedRows(rows2, height)}; auto* in2_value = selected_rows2->mutable_value(); in2_value->mutable_data<float>( - phi::make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), + common::make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place); set_const(ctx, in2_value, 1.0); @@ -268,7 +271,7 @@ TEST(selected_rows_functor, gpu_merge_add) { phi::Copy(ctx, output->value(), cpu_place, true, &output_cpu); EXPECT_EQ(output->height(), height); - EXPECT_EQ(output->value().dims(), phi::make_ddim({3, row_numel})); + EXPECT_EQ(output->value().dims(), common::make_ddim({3, row_numel})); std::vector<int64_t> ret_rows{2, 3, 5}; EXPECT_EQ(output->rows(), ret_rows); diff --git a/test/cpp/fluid/mkldnn/CMakeLists.txt b/test/cpp/fluid/mkldnn/CMakeLists.txt index f83fd91963be20..22ea64bdbdb0c1 100644 --- a/test/cpp/fluid/mkldnn/CMakeLists.txt +++ b/test/cpp/fluid/mkldnn/CMakeLists.txt @@ -7,6 +7,7 @@ cc_test( elementwise_add_op activation_op phi + common scope device_context enforce @@ -21,6 +22,7 @@ cc_test( cpu_quantize_placement_pass cpu_quantize_pass phi + common scope device_context) @@ -32,6 +34,7 @@ cc_test( depthwise_conv tensor phi + common scope device_context enforce @@ -43,6 +46,7 @@ set(TEST_MKLDNN_CACHING_DEPS elementwise_add_op activation_op phi + common scope device_context enforce @@ -74,6 +78,7 @@ if(WIN32 AND WITH_TESTING) generated_op generated_static_op phi + common transpose_op fused_transpose_op scope @@ -95,6 +100,7 @@ cc_test( generated_static_op generated_op phi + common scope device_context enforce) diff --git a/test/cpp/fluid/mkldnn/test_mkldnn_caching.cc b/test/cpp/fluid/mkldnn/test_mkldnn_caching.cc index 24be9e518d37a5..0f62301cdfe6df 100644 --- a/test/cpp/fluid/mkldnn/test_mkldnn_caching.cc +++ b/test/cpp/fluid/mkldnn/test_mkldnn_caching.cc @@ -102,7 +102,7 @@ void RunOperator(const platform::Place &place, std::uniform_real_distribution<T> dist(static_cast<T>(10.0), static_cast<T>(20.0)); std::mt19937 engine; - size_t numel = static_cast<size_t>(phi::product(dims)); + size_t numel = static_cast<size_t>(common::product(dims)); for (int i = 0; i < num_inputs[op_type]; ++i) { input_names[i].tensor->Resize(dims); auto data_ptr = input_names[i].tensor->mutable_data<T>(place); diff --git a/test/cpp/fluid/mkldnn/test_mkldnn_op_inplace.cc b/test/cpp/fluid/mkldnn/test_mkldnn_op_inplace.cc index 4beb314fe6b76b..1c1a0cfb219140 100644 --- a/test/cpp/fluid/mkldnn/test_mkldnn_op_inplace.cc +++ b/test/cpp/fluid/mkldnn/test_mkldnn_op_inplace.cc @@ -69,7 +69,7 @@ bool 
TestMain(const platform::Place &place, std::uniform_real_distribution<T> dist(static_cast<T>(10.0), static_cast<T>(20.0)); std::mt19937 engine; - size_t numel = static_cast<size_t>(phi::product(dims)); + size_t numel = static_cast<size_t>(common::product(dims)); for (int i = 0; i < num_inputs; ++i) { input_names[i].tensor->Resize(dims); auto data_ptr = input_names[i].tensor->mutable_data<T>(place); diff --git a/test/cpp/fluid/mkldnn/test_mkldnn_op_nhwc.cc b/test/cpp/fluid/mkldnn/test_mkldnn_op_nhwc.cc index b152623a6ddcdd..bf652e2de943ed 100644 --- a/test/cpp/fluid/mkldnn/test_mkldnn_op_nhwc.cc +++ b/test/cpp/fluid/mkldnn/test_mkldnn_op_nhwc.cc @@ -62,7 +62,7 @@ void Test_Pool2d_Transpose_NHWC(const std::string &transpose_type) { std::uniform_real_distribution<float> dist(static_cast<float>(10.0), static_cast<float>(20.0)); std::mt19937 engine; - size_t numel = static_cast<size_t>(phi::product(dims)); + size_t numel = static_cast<size_t>(common::product(dims)); input_name.tensor->Resize(dims); auto data_ptr = input_name.tensor->mutable_data<float>(p); for (size_t i = 0; i < numel; ++i) { @@ -123,7 +123,7 @@ TEST(test_pool2d_relu_relu_nhwc, cpu_place) { std::uniform_real_distribution<float> dist(static_cast<float>(10.0), static_cast<float>(20.0)); std::mt19937 engine; - size_t numel = static_cast<size_t>(phi::product(dims)); + size_t numel = static_cast<size_t>(common::product(dims)); input_name.tensor->Resize(dims); auto data_ptr = input_name.tensor->mutable_data<float>(p); for (size_t i = 0; i < numel; ++i) { @@ -186,7 +186,7 @@ TEST(test_pool2d_shape_nhwc, cpu_place) { std::uniform_real_distribution<float> dist(static_cast<float>(10.0), static_cast<float>(20.0)); std::mt19937 engine; - size_t numel = static_cast<size_t>(phi::product(dims)); + size_t numel = static_cast<size_t>(common::product(dims)); input_name.tensor->Resize(dims); auto data_ptr = input_name.tensor->mutable_data<float>(p); for (size_t i = 0; i < numel; ++i) { @@ -242,7 +242,7 @@ TEST(test_pool2d_crop_nhwc, cpu_place) { // Initialize input data std::uniform_real_distribution<float> dist(10.0f, 20.0f); std::mt19937 engine; - size_t numel = static_cast<size_t>(phi::product(dims)); + size_t numel = static_cast<size_t>(common::product(dims)); input_name.tensor->Resize(dims); auto data_ptr = input_name.tensor->mutable_data<float>(p); for (size_t i = 0; i < numel; ++i) { @@ -250,11 +250,11 @@ TEST(test_pool2d_crop_nhwc, cpu_place) { } // Second input (Y) to crop has no buffer, // but as it is MKLDNN its shape order should be NCHW - auto expected_dims_nchw = phi::vectorize(expected_dims); + auto expected_dims_nchw = common::vectorize(expected_dims); std::rotate(expected_dims_nchw.begin() + 1, expected_dims_nchw.end() - 1, expected_dims_nchw.end()); - second_crop_input_name.tensor->Resize(phi::make_ddim(expected_dims_nchw)); + second_crop_input_name.tensor->Resize(common::make_ddim(expected_dims_nchw)); const auto second_crop_input_md = dnnl::memory::desc(expected_dims_nchw, dnnl::memory::data_type::f32, diff --git a/test/cpp/fluid/nccl/nccl_op_test.cu.cc b/test/cpp/fluid/nccl/nccl_op_test.cu.cc index 87c0708e12d398..b8a47b97031653 100644 --- a/test/cpp/fluid/nccl/nccl_op_test.cu.cc +++ b/test/cpp/fluid/nccl/nccl_op_test.cu.cc @@ -102,7 +102,7 @@ class NCCLTester : public ::testing::Test { if (!send_tensor->numel()) { send_tensor->mutable_data<float>(kDims, place); - std::vector<float> send_vector(phi::product(kDims), GetGPUData(gpu_id)); + std::vector<float> send_vector(common::product(kDims), GetGPUData(gpu_id)); paddle::framework::TensorFromVector(send_vector, *ctx, send_tensor); VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); } @@ -111,7 +111,7 @@ 
PADDLE_ENFORCE_EQ( send_tensor->numel(), - phi::product(kDims), + common::product(kDims), paddle::platform::errors::InvalidArgument("Tensor numel not match!")); auto op = f::OpRegistry::CreateOp(*op1); @@ -184,7 +184,7 @@ void NCCLTester::testNcclAllReduceOp() { dev_ctx->stream()); dev_ctx->Wait(); - for (int64_t j = 0; j < phi::product(kDims); ++j) { + for (int64_t j = 0; j < common::product(kDims); ++j) { ASSERT_NEAR(ct[j], expected_result, 1e-5); } } @@ -241,7 +241,7 @@ void NCCLTester::testNcclReduceOp() { dev_ctx->stream()); dev_ctx->Wait(); - for (int64_t j = 0; j < phi::product(kDims); ++j) { + for (int64_t j = 0; j < common::product(kDims); ++j) { ASSERT_NEAR(ct[j], expected_result, 1e-5); } } @@ -299,7 +299,7 @@ void NCCLTester::testNcclBcastOp() { dev_ctx->stream()); dev_ctx->Wait(); - for (int64_t j = 0; j < phi::product(kDims); ++j) { + for (int64_t j = 0; j < common::product(kDims); ++j) { ASSERT_NEAR(ct[j], result, 1e-5); } } diff --git a/test/cpp/fluid/pscore/CMakeLists.txt b/test/cpp/fluid/pscore/CMakeLists.txt index eb6d3b4385487a..9413c8aaa43f92 100644 --- a/test/cpp/fluid/pscore/CMakeLists.txt +++ b/test/cpp/fluid/pscore/CMakeLists.txt @@ -51,25 +51,28 @@ endif() set_source_files_properties( heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -paddle_test(heter_server_test SRCS heter_server_test.cc) +paddle_test(heter_server_test SRCS heter_server_test.cc DEPS common) set_source_files_properties( send_and_recv_op_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -paddle_test(send_and_recv_cpu_test SRCS send_and_recv_op_cpu_test.cc) +paddle_test(send_and_recv_cpu_test SRCS send_and_recv_op_cpu_test.cc DEPS + common) set_source_files_properties( send_and_recv_op_gpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -paddle_test(send_and_recv_gpu_test SRCS send_and_recv_op_gpu_test.cc) +paddle_test(send_and_recv_gpu_test SRCS send_and_recv_op_gpu_test.cc DEPS + common) set_source_files_properties( heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -paddle_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc) +paddle_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc + DEPS common) #set_source_files_properties(heter_cloud_comm_cpu_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +#cc_test(heter_cloud_comm_cpu_test SRCS heter_cloud_comm_cpu_test.cc DEPS executor scope proto_desc generated_static_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} phi common) set_source_files_properties( switch_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -80,9 +83,11 @@ cc_binary( DEPS executor scope + common proto_desc generated_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} - phi) + phi + common) diff --git a/test/cpp/fluid/scatter_test.cc b/test/cpp/fluid/scatter_test.cc index 7f774089fd9ca5..f4fa2c9894c642 100644 --- a/test/cpp/fluid/scatter_test.cc +++ b/test/cpp/fluid/scatter_test.cc @@ -24,9 +24,9 @@ TEST(scatter, ScatterUpdate) { phi::DenseTensor index; phi::DenseTensor output; - auto* p_src = src.mutable_data<float>(phi::make_ddim({1, 4}), + auto* p_src = src.mutable_data<float>(common::make_ddim({1, 4}), paddle::platform::CPUPlace()); - auto* p_index = index.mutable_data<int>(phi::make_ddim({1}), + auto* p_index = 
index.mutable_data<int>(common::make_ddim({1}), paddle::platform::CPUPlace()); for (size_t i = 0; i < 4; ++i) { @@ -34,7 +34,7 @@ } p_index[0] = 1; - auto* p_output = output.mutable_data<float>(phi::make_ddim({4, 4}), + auto* p_output = output.mutable_data<float>(common::make_ddim({4, 4}), paddle::platform::CPUPlace()); for (int64_t i = 0; i < output.numel(); ++i) { diff --git a/test/cpp/fluid/test_common_infer_shape_functions.cc b/test/cpp/fluid/test_common_infer_shape_functions.cc index 84332f110216c3..8519e6ca7f8494 100644 --- a/test/cpp/fluid/test_common_infer_shape_functions.cc +++ b/test/cpp/fluid/test_common_infer_shape_functions.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gtest/gtest.h" +#include "paddle/common/ddim.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/imperative/infer_shape_context.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" -#include "paddle/phi/core/ddim.h" USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); diff --git a/test/cpp/imperative/CMakeLists.txt b/test/cpp/imperative/CMakeLists.txt index 82ae6c7a3fa749..491a008a963283 100644 --- a/test/cpp/imperative/CMakeLists.txt +++ b/test/cpp/imperative/CMakeLists.txt @@ -32,7 +32,7 @@ endif() cc_test( test_gradient_accmulator SRCS test_gradient_accmulator.cc - DEPS memcpy selected_rows_utils gradient_accumulator phi phi_utils) + DEPS memcpy selected_rows_utils gradient_accumulator phi common phi_utils) cc_test( test_layer SRCS test_layer.cc diff --git a/test/cpp/imperative/heter_ccl_context_test.cc b/test/cpp/imperative/heter_ccl_context_test.cc index 8c544669dc799b..37520d10f172af 100644 --- a/test/cpp/imperative/heter_ccl_context_test.cc +++ b/test/cpp/imperative/heter_ccl_context_test.cc @@ -50,7 +50,7 @@ void AllReduceByStream(int local_rank, int device_id) { // input and output data framework::Variable* src_dev_var(new framework::Variable()); auto* src_dev_tensor = src_dev_var->GetMutable<phi::DenseTensor>(); - src_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place); + src_dev_tensor->mutable_data<float>(common::make_ddim({data_size}), place); std::vector<float> src_vec; for (int i = 0; i < data_size; i++) { @@ -61,7 +61,7 @@ framework::Variable* dst_dev_var(new framework::Variable()); auto* dst_dev_tensor = dst_dev_var->GetMutable<phi::DenseTensor>(); - dst_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place); + dst_dev_tensor->mutable_data<float>(common::make_ddim({data_size}), place); // call allreduce hpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false); diff --git a/test/cpp/imperative/nccl_context_test.cc b/test/cpp/imperative/nccl_context_test.cc index 80bd28e1c8b03f..8b9958ee561824 100644 --- a/test/cpp/imperative/nccl_context_test.cc +++ b/test/cpp/imperative/nccl_context_test.cc @@ -87,7 +87,7 @@ void Broadcast(int local_rank, int device_id) { framework::Variable* src_dev_var(new framework::Variable()); auto* src_dev_tensor = src_dev_var->GetMutable<phi::DenseTensor>(); - src_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place); + src_dev_tensor->mutable_data<float>(common::make_ddim({data_size}), place); // fill data for rank 0 only std::vector<float> src_vec; diff --git a/test/cpp/imperative/test_gradient_accmulator.cc b/test/cpp/imperative/test_gradient_accmulator.cc index bb264250ecf567..0af376da0a7310 100644 --- a/test/cpp/imperative/test_gradient_accmulator.cc +++ 
b/test/cpp/imperative/test_gradient_accmulator.cc @@ -40,7 +40,8 @@ TEST(Test__SelectedRowsMerge_Test, SelectedRowsMerge) { auto sr2 = std::make_shared<phi::SelectedRows>(rows, table_size); // initialize a sparse table 1 - sr1->mutable_value()->Resize(phi::make_ddim({table_size, embedding_width})); + sr1->mutable_value()->Resize( + common::make_ddim({table_size, embedding_width})); auto* data_sr1 = sr1->mutable_value()->mutable_data<float>(cpu); for (int64_t i = 0; i < table_size; ++i) { for (int64_t j = 0; j < embedding_width; ++j) { @@ -49,7 +50,8 @@ } // initialize a sparse table 2 - sr2->mutable_value()->Resize(phi::make_ddim({table_size, embedding_width})); + sr2->mutable_value()->Resize( + common::make_ddim({table_size, embedding_width})); auto* data_sr2 = sr2->mutable_value()->mutable_data<float>(cpu); for (int64_t i = 0; i < table_size; ++i) { for (int64_t j = 0; j < embedding_width; ++j) { @@ -92,8 +94,8 @@ int TensorddTest(Place1 place1, Place2 place2, T t1, T t2) { std::vector<int64_t> dims = {2, 5}; auto* src = var1.GetMutable<phi::DenseTensor>(); auto* dst = var2.GetMutable<phi::DenseTensor>(); - src->Resize(phi::make_ddim(dims)); - dst->Resize(phi::make_ddim(dims)); + src->Resize(common::make_ddim(dims)); + dst->Resize(common::make_ddim(dims)); auto* src_mutable = src->mutable_data<T>(place1); auto* dst_mutable = dst->mutable_data<T>(place2); diff --git a/test/cpp/imperative/test_group.cc b/test/cpp/imperative/test_group.cc index f2eeb24b7eccef..2243a24dee90d0 100644 --- a/test/cpp/imperative/test_group.cc +++ b/test/cpp/imperative/test_group.cc @@ -99,7 +99,7 @@ void GroupConcatSplit(Place place, size_t size) { { // concat auto* tensor = group.dense_contents_.GetMutable<phi::DenseTensor>(); - tensor->Resize(phi::make_ddim({group.all_length_})) + tensor->Resize(common::make_ddim({group.all_length_})) .mutable_data(place, framework::TransToPhiDataType(group.dtype_)); group.ConcatTensors(*dev_ctx); diff --git a/test/cpp/imperative/test_hooks.cc b/test/cpp/imperative/test_hooks.cc index 5307139a42652e..3118d38be3a933 100644 --- a/test/cpp/imperative/test_hooks.cc +++ b/test/cpp/imperative/test_hooks.cc @@ -85,7 +85,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { auto* x_tensor = x->MutableVar()->GetMutable<phi::DenseTensor>(); auto* y_tensor = y->MutableVar()->GetMutable<phi::DenseTensor>(); - x_tensor->Resize(phi::make_ddim(x_dims)); + x_tensor->Resize(common::make_ddim(x_dims)); auto* mutable_x = x_tensor->mutable_data<float>(place); memory::Copy(place, mutable_x, @@ -93,7 +93,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { src_data.data(), sizeof(float) * src_data.size()); - y_tensor->Resize(phi::make_ddim(y_dims)); + y_tensor->Resize(common::make_ddim(y_dims)); auto* mutable_y = y_tensor->mutable_data<float>(place); memory::Copy(place, mutable_y, @@ -175,7 +175,7 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { auto* y_tensor = y->MutableVar()->GetMutable<phi::DenseTensor>(); auto* z_tensor = z->MutableVar()->GetMutable<phi::DenseTensor>(); - x_tensor->Resize(phi::make_ddim(x_dims)); + x_tensor->Resize(common::make_ddim(x_dims)); auto* mutable_x = x_tensor->mutable_data<float>(place); memory::Copy(place, mutable_x, @@ -183,7 +183,7 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { src_data.data(), sizeof(float) * src_data.size()); - y_tensor->Resize(phi::make_ddim(y_dims)); + y_tensor->Resize(common::make_ddim(y_dims)); auto* mutable_y = y_tensor->mutable_data<float>(place); memory::Copy(place, mutable_y, @@ -191,7 +191,7 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { src_data.data(), sizeof(float) * src_data.size()); - 
z_tensor->Resize(common::make_ddim(z_dims)); auto* mutable_z = z_tensor->mutable_data<float>(place); memory::Copy(place, mutable_z, diff --git a/test/cpp/imperative/test_prepare_op.cc b/test/cpp/imperative/test_prepare_op.cc index 22473cac68dfef..e46390d88bdc4f 100644 --- a/test/cpp/imperative/test_prepare_op.cc +++ b/test/cpp/imperative/test_prepare_op.cc @@ -138,7 +138,7 @@ TEST(test_prepare_op, test_prepare_data) { // prepare a cpu only input auto* vin_tensor = vin->MutableVar()->GetMutable<phi::DenseTensor>(); - vin_tensor->Resize(phi::make_ddim(dims)); + vin_tensor->Resize(common::make_ddim(dims)); auto* vin_mutable_tensor = vin_tensor->mutable_data<float>(cpu_place); paddle::memory::Copy(cpu_place, vin_mutable_tensor, @@ -196,7 +196,7 @@ void TestPrepareDataSamePlace(framework::AttributeMap attr_map) { // prepare a cpu only input auto* vin_tensor = vin->MutableVar()->GetMutable<phi::DenseTensor>(); - vin_tensor->Resize(phi::make_ddim(dims)); + vin_tensor->Resize(common::make_ddim(dims)); auto* vin_mutable_tensor = vin_tensor->mutable_data<float>(cpu_place); paddle::memory::Copy(cpu_place, vin_mutable_tensor, diff --git a/test/cpp/imperative/test_tracer.cc b/test/cpp/imperative/test_tracer.cc index efb7dbf3603ec7..5c29b61dfbe23b 100644 --- a/test/cpp/imperative/test_tracer.cc +++ b/test/cpp/imperative/test_tracer.cc @@ -72,14 +72,14 @@ TEST(test_tracer, test_trace_op) { auto* x_in_tensor = x_in->MutableVar()->GetMutable<phi::DenseTensor>(); auto* y_in_tensor = y_in->MutableVar()->GetMutable<phi::DenseTensor>(); - x_in_tensor->Resize(phi::make_ddim(dims1)); + x_in_tensor->Resize(common::make_ddim(dims1)); auto* mutable_x = x_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_x, place, src_data.data(), sizeof(float) * src_data.size()); - y_in_tensor->Resize(phi::make_ddim(dims2)); + y_in_tensor->Resize(common::make_ddim(dims2)); auto* mutable_y = y_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_y, @@ -124,14 +124,14 @@ TEST(test_tracer, test_trace_op_with_backward) { auto* x_in_tensor = x_in->MutableVar()->GetMutable<phi::DenseTensor>(); auto* y_in_tensor = y_in->MutableVar()->GetMutable<phi::DenseTensor>(); - x_in_tensor->Resize(phi::make_ddim(dims1)); + x_in_tensor->Resize(common::make_ddim(dims1)); auto* mutable_x = x_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_x, place, src_data.data(), sizeof(float) * src_data.size()); - y_in_tensor->Resize(phi::make_ddim(dims2)); + y_in_tensor->Resize(common::make_ddim(dims2)); auto* mutable_y = y_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_y, @@ -170,14 +170,14 @@ TEST(test_tracer, test_track_backward_output) { auto* x_in_tensor = x_in->MutableVar()->GetMutable<phi::DenseTensor>(); auto* y_in_tensor = y_in->MutableVar()->GetMutable<phi::DenseTensor>(); - x_in_tensor->Resize(phi::make_ddim(dims1)); + x_in_tensor->Resize(common::make_ddim(dims1)); auto* mutable_x = x_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_x, place, src_data.data(), sizeof(float) * src_data.size()); - y_in_tensor->Resize(phi::make_ddim(dims2)); + y_in_tensor->Resize(common::make_ddim(dims2)); auto* mutable_y = y_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_y, @@ -215,14 +215,14 @@ TEST(test_tracer, test_track_backward_input) { auto* x_in_tensor = x_in->MutableVar()->GetMutable<phi::DenseTensor>(); auto* y_in_tensor = y_in->MutableVar()->GetMutable<phi::DenseTensor>(); - x_in_tensor->Resize(phi::make_ddim(dims1)); + x_in_tensor->Resize(common::make_ddim(dims1)); auto* mutable_x = x_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_x, place, src_data.data(), sizeof(float) * src_data.size()); - 
y_in_tensor->Resize(phi::make_ddim(dims2)); + y_in_tensor->Resize(common::make_ddim(dims2)); auto* mutable_y = y_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_y, @@ -263,14 +263,14 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { auto* x_in_tensor = x_in->MutableVar()->GetMutable<phi::DenseTensor>(); auto* y_in_tensor = y_in->MutableVar()->GetMutable<phi::DenseTensor>(); - x_in_tensor->Resize(phi::make_ddim(dims1)); + x_in_tensor->Resize(common::make_ddim(dims1)); auto* mutable_x = x_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_x, place, src_data.data(), sizeof(float) * src_data.size()); - y_in_tensor->Resize(phi::make_ddim(dims2)); + y_in_tensor->Resize(common::make_ddim(dims2)); auto* mutable_y = y_in_tensor->mutable_data<float>(gpu_place); paddle::memory::Copy(gpu_place, mutable_y, @@ -400,14 +400,14 @@ TEST(test_tracer, test_var_without_grad_var) { auto* x_in_tensor = x_in->MutableVar()->GetMutable<phi::DenseTensor>(); auto* y_in_tensor = y_in->MutableVar()->GetMutable<phi::DenseTensor>(); - x_in_tensor->Resize(phi::make_ddim(dims1)); + x_in_tensor->Resize(common::make_ddim(dims1)); auto* mutable_x = x_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_x, place, src_data.data(), sizeof(float) * src_data.size()); - y_in_tensor->Resize(phi::make_ddim(dims2)); + y_in_tensor->Resize(common::make_ddim(dims2)); auto* mutable_y = y_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_y, @@ -619,14 +619,14 @@ TEST(test_tracer, eager_tracer) { auto* x_in_tensor = x_in->MutableVar()->GetMutable<phi::DenseTensor>(); auto* y_in_tensor = y_in->MutableVar()->GetMutable<phi::DenseTensor>(); - x_in_tensor->Resize(phi::make_ddim(dims1)); + x_in_tensor->Resize(common::make_ddim(dims1)); auto* mutable_x = x_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_x, place, src_data.data(), sizeof(float) * src_data.size()); - y_in_tensor->Resize(phi::make_ddim(dims2)); + y_in_tensor->Resize(common::make_ddim(dims2)); auto* mutable_y = y_in_tensor->mutable_data<float>(place); paddle::memory::Copy(place, mutable_y, diff --git a/test/cpp/inference/analysis/CMakeLists.txt b/test/cpp/inference/analysis/CMakeLists.txt index 74a1e91fc4dc1c..5094272adaadf1 100644 --- a/test/cpp/inference/analysis/CMakeLists.txt +++ b/test/cpp/inference/analysis/CMakeLists.txt @@ -44,6 +44,7 @@ if(NOT APPLE) SRCS analyzer_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) diff --git a/test/cpp/inference/api/CMakeLists.txt b/test/cpp/inference/api/CMakeLists.txt index a0ac3631b7181d..bb4a8ed761ad60 100644 --- a/test/cpp/inference/api/CMakeLists.txt +++ b/test/cpp/inference/api/CMakeLists.txt @@ -7,12 +7,12 @@ set(inference_api_tester_deps paddle_inference_api analysis_config) cc_test( test_paddle_inference_api SRCS api_tester.cc - DEPS ${inference_api_tester_deps}) + DEPS ${inference_api_tester_deps} common) cc_test( inference_api_helper_test SRCS helper_test.cc - DEPS ${inference_api_tester_deps}) + DEPS ${inference_api_tester_deps} common) if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will @@ -120,6 +120,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS ${filename} EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${install_dir}/model @@ -133,6 +134,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS ${filename} EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${install_dir}/model @@ -150,6 +152,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS ${filename} EXTRA_DEPS + common paddle_inference_shared ARGS 
--infer_model=${install_dir}/mobilenet_v2_models/1 @@ -159,7 +162,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) function(inference_analysis_api_test_build TARGET_NAME filename) inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS - paddle_inference_shared) + common paddle_inference_shared) endfunction() function(inference_analysis_api_int8_test_run TARGET_NAME test_binary @@ -219,7 +222,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) function(inference_analysis_api_test_with_fake_data_build TARGET_NAME filename) inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS - paddle_inference_shared) + common paddle_inference_shared) endfunction() function(inference_analysis_api_test_with_fake_data_run TARGET_NAME @@ -325,7 +328,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) download_model_and_data_without_verify( ${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} - analyzer_rnn1_tester.cc) + analyzer_rnn1_tester.cc EXTRA_DEPS common) # seq_pool1 set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") @@ -334,19 +337,19 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) "seq_pool1_data.txt.tar.gz") inference_analysis_api_test( test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR} - analyzer_seq_pool1_compare_determine_tester.cc) + analyzer_seq_pool1_compare_determine_tester.cc EXTRA_DEPS common) inference_analysis_api_test( test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} - analyzer_seq_pool1_compare_tester.cc) + analyzer_seq_pool1_compare_tester.cc EXTRA_DEPS common) inference_analysis_api_test( test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} - analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc) + analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc EXTRA_DEPS common) inference_analysis_api_test( test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} - analyzer_seq_pool1_fuse_statis_tester.cc) + analyzer_seq_pool1_fuse_statis_tester.cc EXTRA_DEPS common) inference_analysis_api_test( test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} - analyzer_seq_pool1_profile_tester.cc) + analyzer_seq_pool1_profile_tester.cc EXTRA_DEPS common) if(NOT WIN32) set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120) @@ -376,7 +379,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) download_model_and_data_without_verify( ${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} - analyzer_rnn2_tester.cc) + analyzer_rnn2_tester.cc EXTRA_DEPS common) # TODO(luotao, Superjom) Disable DAM test, temporarily fix # https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914. 
@@ -385,7 +388,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data_without_verify(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") - #inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator) + #inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator common) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") @@ -398,13 +401,14 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) analyzer_dam_tester.cc EXTRA_DEPS paddle_inference_shared + common ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt) #save model inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} - analyzer_save_model_tester.cc) + analyzer_save_model_tester.cc EXTRA_DEPS common) # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") @@ -412,7 +416,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) ${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} - analyzer_ner_tester.cc) + analyzer_ner_tester.cc EXTRA_DEPS common) # lac set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") @@ -420,7 +424,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) ${LAC_INSTALL_DIR} "lac_model.tar.gz" 419ca6eb85f57a01bfe173591910aec5 "lac_data.txt.tar.gz" 9983539cd6b34fbdc411e43422776bfd) inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} - analyzer_lac_tester.cc) + analyzer_lac_tester.cc EXTRA_DEPS common) # Ernie set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") @@ -431,13 +435,14 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) 73beea65abda2edb61c1662cd3180c62) if(WITH_GPU) inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} - analyzer_ernie_tester.cc) + analyzer_ernie_tester.cc EXTRA_DEPS common) inference_analysis_api_test(gpu_ernie_half_test ${ERNIE_INSTALL_DIR} - gpu_ernie_half_test.cc) + gpu_ernie_half_test.cc EXTRA_DEPS common) set_tests_properties(gpu_ernie_half_test PROPERTIES TIMEOUT 60) endif() inference_analysis_api_int8_test( - test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc) + test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc + EXTRA_DEPS common) # Ernie large set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_Large") @@ -453,6 +458,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) analyzer_ernie_tester.cc EXTRA_DEPS paddle_inference_shared + common ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt @@ -477,7 +483,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) 36ae620020cc3377f45ed330dd36238f) inference_analysis_api_test( test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} - analyzer_text_classification_tester.cc) + analyzer_text_classification_tester.cc EXTRA_DEPS common) # seq_conv1 set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") @@ -485,7 +491,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) ${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} - analyzer_seq_conv1_tester.cc) + analyzer_seq_conv1_tester.cc EXTRA_DEPS common) # transformer, the dataset only works on batch_size=8 now set(TRANSFORMER_INSTALL_DIR 
"${INFERENCE_DEMO_INSTALL_DIR}/transformer") @@ -497,6 +503,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_transformer_compare_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model @@ -508,6 +515,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_transformer_fuse_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model @@ -519,6 +527,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_transformer_profile_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model @@ -537,6 +546,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_vit_ocr_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${VIT_OCR_INSTALL_DIR}/vit_ocr/model @@ -550,7 +560,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) "inference-vis-demos/ocr.tar.gz") endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} - analyzer_vis_tester.cc) + analyzer_vis_tester.cc EXTRA_DEPS common) # densebox set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox") @@ -560,6 +570,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_detect_functional_mkldnn_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model @@ -573,8 +584,9 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) ${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/mobilenet.tar.gz") endif() - inference_analysis_api_test(test_analyzer_mobilenet_transpose - ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) + inference_analysis_api_test( + test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} + analyzer_vis_tester.cc EXTRA_DEPS common) ### Image classification tests with fake data set(IMG_CLASS_TEST_APP "test_analyzer_image_classification") @@ -913,7 +925,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) download_model_and_data_without_verify( ${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} - analyzer_bert_tester.cc) + analyzer_bert_tester.cc EXTRA_DEPS common) # multiple models prediction set(MMP_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/multi_model_prediction") @@ -921,7 +933,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) PaddleInference/mobilenet_v2_models.tar.gz) inference_multiple_models_analysis_api_test( test_analyzer_multi_model_prediction ${MMP_INSTALL_DIR} - analyzer_mmp_tester.cc) + analyzer_mmp_tester.cc EXTRA_DEPS common) if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models") @@ -942,6 +954,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_mobilenet_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -950,6 +963,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_mark_trt_engine_outputs_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -958,6 +972,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_disable_tensorrt_half_ops_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -966,6 +981,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_cascade_rcnn_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS 
--infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -975,6 +991,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) trt_split_converter_test.cc EXTRA_DEPS paddle_inference_shared + common ARGS --infer_model=${TEST_SPLIT_CONVERTER_MODEL}/) inference_analysis_test( @@ -982,6 +999,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS + common paddle_inference_c_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -990,6 +1008,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS + common paddle_inference_c_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -1000,6 +1019,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_rebind_stream_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -1017,6 +1037,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_quant_int8_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) @@ -1033,6 +1054,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_quant_int8_yolov3_r50_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR}) @@ -1057,6 +1079,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_dynamic_shape_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}) @@ -1073,6 +1096,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_dynamic_shape_ernie_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) @@ -1091,6 +1115,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) trt_dynamic_shape_transformer_prune_test.cc EXTRA_DEPS paddle_inference_shared + common ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) @@ -1105,6 +1130,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_dynamic_shape_ernie_serialize_deserialize_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) @@ -1120,6 +1146,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized) @@ -1134,6 +1161,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS lite_mul_model_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${LITE_MODEL_INSTALL_DIR}) @@ -1142,6 +1170,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS lite_resnet50_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) @@ -1151,6 +1180,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_capi_exp_tester.cc EXTRA_DEPS + common paddle_inference_c_shared ARGS --infer_model=${RESNET50_MODEL_DIR}/model) @@ -1160,6 +1190,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_capi_exp_pd_config_tester.cc EXTRA_DEPS + common paddle_inference_c_shared ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) @@ -1169,6 +1200,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_capi_exp_pd_tensor_tester.cc EXTRA_DEPS + common paddle_inference_c_shared ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) @@ -1179,6 +1211,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS + common 
paddle_inference_c_shared ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) @@ -1189,6 +1222,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_zerocopy_tensor_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -1199,6 +1233,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_dist_model_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -1213,6 +1248,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_dist_model_xpu_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -1223,6 +1259,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_paddle_tensor_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${OCR_INSTALL_DIR}/model @@ -1235,6 +1272,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS + common paddle_inference_c_shared ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) @@ -1245,6 +1283,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS + common paddle_inference_c_shared ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) @@ -1255,6 +1294,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS paddle_infer_api_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) @@ -1264,6 +1304,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS paddle_infer_api_copy_tensor_tester.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) @@ -1274,7 +1315,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) cc_test( paddle_infer_api_errors_test SRCS paddle_infer_api_errors_tester.cc - DEPS ${inference_api_tester_deps}) + DEPS ${inference_api_tester_deps} common) if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 400) @@ -1345,6 +1386,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS ipu_word2vec_sample.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${WORD2VEC_INSTALL_DIR}) @@ -1352,11 +1394,23 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) # ERNIE set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") inference_analysis_api_test( - ipu_ernie_test ${ERNIE_INSTALL_DIR} ipu_ernie_test.cc ARGS --warmup=true - --repeat=10) + ipu_ernie_test + ${ERNIE_INSTALL_DIR} + ipu_ernie_test.cc + ARGS + --warmup=true + --repeat=10 + EXTRA_DEPS + common) inference_analysis_api_test( - ipu_ernie_fp16_test ${ERNIE_INSTALL_DIR} ipu_ernie_fp16_test.cc ARGS - --warmup=true --repeat=10) + ipu_ernie_fp16_test + ${ERNIE_INSTALL_DIR} + ipu_ernie_fp16_test.cc + ARGS + --warmup=true + --repeat=10 + EXTRA_DEPS + common) # Resnet50 set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") @@ -1365,6 +1419,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS ipu_resnet50_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR} @@ -1375,6 +1430,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS ipu_resnet50_fp16_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR} @@ -1390,7 +1446,9 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) --model_name="Resnet50" --infer_model=${RESNET50_MODEL_DIR} --warmup=true - --repeat=10) + --repeat=10 + EXTRA_DEPS + common) endif() if(WITH_XPU) @@ -1399,6 +1457,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS xpu_config_resnet50_test.cc EXTRA_DEPS + 
common paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) @@ -1407,6 +1466,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS xpu_runtime_config_resnet50_test.cc EXTRA_DEPS + common paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) @@ -1419,6 +1479,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) SRCS api_impl_tester.cc DEPS + common paddle_inference_shared ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} @@ -1433,6 +1494,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) analysis_predictor_tester.cc DEPS paddle_inference_shared + common ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() @@ -1445,6 +1507,7 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) mkldnn_quantizer_tester.cc DEPS paddle_inference_shared + common ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() diff --git a/test/cpp/inference/api/api_impl_tester.cc b/test/cpp/inference/api/api_impl_tester.cc index 78e908189cc1d4..535a4995665ed2 100644 --- a/test/cpp/inference/api/api_impl_tester.cc +++ b/test/cpp/inference/api/api_impl_tester.cc @@ -54,7 +54,7 @@ PaddleTensor LodTensorToPaddleTensor(phi::DenseTensor* t) { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported tensor data type. Now only supports INT64, FP32, INT32.")); } - pt.shape = phi::vectorize<int>(t->dims()); + pt.shape = common::vectorize<int>(t->dims()); return pt; } @@ -135,7 +135,7 @@ void MainImageClassification(const ::paddle::PaddlePlace& place) { // Use normalized image pixels as input data, // which should be in the range [0.0, 1.0]. feed_target_shapes[0][0] = batch_size; - framework::DDim input_dims = phi::make_ddim(feed_target_shapes[0]); + framework::DDim input_dims = common::make_ddim(feed_target_shapes[0]); SetupTensor<float>( &input, input_dims, static_cast<float>(0), static_cast<float>(1)); std::vector<phi::DenseTensor*> cpu_feeds; @@ -243,7 +243,7 @@ void MainThreadsImageClassification(const ::paddle::PaddlePlace& place) { std::vector<std::vector<int64_t>> feed_target_shapes = GetFeedTargetShapes(config.model_dir, /*is_combined*/ false); feed_target_shapes[0][0] = batch_size; - framework::DDim input_dims = phi::make_ddim(feed_target_shapes[0]); + framework::DDim input_dims = common::make_ddim(feed_target_shapes[0]); SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f); paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i])); diff --git a/test/cpp/inference/api/mkldnn_quantizer_tester.cc b/test/cpp/inference/api/mkldnn_quantizer_tester.cc index 8edad9fe27127a..28840dbbb0fb40 100644 --- a/test/cpp/inference/api/mkldnn_quantizer_tester.cc +++ b/test/cpp/inference/api/mkldnn_quantizer_tester.cc @@ -108,7 +108,7 @@ TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) { auto max_val = *std::max_element(values.begin(), values.end()); phi::DenseTensor var_tensor; - var_tensor.Resize(phi::make_dim(values.size())); + var_tensor.Resize(common::make_dim(values.size())); std::copy(begin(values), end(values), var_tensor.mutable_data<float>(phi::CPUPlace())); @@ -124,7 +124,7 @@ TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) { auto max_val = *std::max_element(values.begin(), values.end()); phi::DenseTensor var_tensor; - var_tensor.Resize(phi::make_dim(values.size())); + var_tensor.Resize(common::make_dim(values.size())); std::copy(begin(values), end(values), var_tensor.mutable_data<float>(phi::CPUPlace())); @@ -148,7 +148,7 @@ TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) { auto max_val = *std::max_element(values.begin(), values.end()); phi::DenseTensor var_tensor; - var_tensor.Resize(phi::make_dim(values.size())); + var_tensor.Resize(common::make_dim(values.size())); 
std::copy(begin(values), end(values), var_tensor.mutable_data<float>(phi::CPUPlace())); @@ -172,7 +172,7 @@ TEST_F(MkldnnQuantizerTest, histogram_zero_bins) { auto max_val = *std::max_element(values.begin(), values.end()); phi::DenseTensor var_tensor; - var_tensor.Resize(phi::make_dim(values.size())); + var_tensor.Resize(common::make_dim(values.size())); std::copy(begin(values), end(values), var_tensor.mutable_data<float>(phi::CPUPlace())); @@ -197,7 +197,7 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) { const auto& values = positive_and_negative_values; phi::DenseTensor var_tensor; - var_tensor.Resize(phi::make_dim(values.size())); + var_tensor.Resize(common::make_dim(values.size())); std::copy(begin(values), end(values), var_tensor.mutable_data<float>(phi::CPUPlace())); @@ -217,7 +217,7 @@ TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) { auto max_val = *std::max_element(values.begin(), values.end()); phi::DenseTensor var_tensor; - var_tensor.Resize(phi::make_dim(values.size())); + var_tensor.Resize(common::make_dim(values.size())); std::copy(begin(values), end(values), var_tensor.mutable_data<float>(phi::CPUPlace())); @@ -237,7 +237,7 @@ TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) { auto max_val = *std::max_element(values.begin(), values.end()); phi::DenseTensor var_tensor; - var_tensor.Resize(phi::make_dim(values.size())); + var_tensor.Resize(common::make_dim(values.size())); std::copy(begin(values), end(values), var_tensor.mutable_data<float>(phi::CPUPlace())); @@ -258,7 +258,7 @@ TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) { int channels = 3; phi::DenseTensor var_tensor; - var_tensor.Resize(phi::make_dim(channels, 1, 1, values.size())); + var_tensor.Resize(common::make_dim(channels, 1, 1, values.size())); for (int i = 0; i < channels; i++) std::copy( begin(values), @@ -281,7 +281,7 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) { const auto& values = non_negative_values; phi::DenseTensor var_tensor; - var_tensor.Resize(phi::make_dim(values.size())); + var_tensor.Resize(common::make_dim(values.size())); std::copy(begin(values), end(values), var_tensor.mutable_data<float>(phi::CPUPlace())); @@ -307,14 +307,14 @@ const std::vector<std::vector<float>> wh = { TEST_F(MkldnnQuantizerTest, max_ch_gru_scaling_factor) { phi::DenseTensor wx_tensor, wh_tensor, lod_tensor; - wx_tensor.Resize(phi::make_dim(wx.size(), wx[0].size())); + wx_tensor.Resize(common::make_dim(wx.size(), wx[0].size())); for (size_t i = 0; i < wx.size(); i++) std::copy( begin(wx[i]), end(wx[i]), wx_tensor.mutable_data<float>(phi::CPUPlace()) + i * wx[0].size()); - wh_tensor.Resize(phi::make_dim(wh.size(), wh[0].size())); + wh_tensor.Resize(common::make_dim(wh.size(), wh[0].size())); for (size_t i = 0; i < wh.size(); i++) std::copy( begin(wh[i]), @@ -337,14 +337,14 @@ TEST_F(MkldnnQuantizerTest, max_ch_gru_scaling_factor) { TEST_F(MkldnnQuantizerTest, max_ch_lstm_scaling_factor) { phi::DenseTensor wx_tensor, wh_tensor, lod_tensor; - wx_tensor.Resize(phi::make_dim(wx.size(), wx[0].size())); + wx_tensor.Resize(common::make_dim(wx.size(), wx[0].size())); for (size_t i = 0; i < wx.size(); i++) std::copy( begin(wx[i]), end(wx[i]), wx_tensor.mutable_data<float>(phi::CPUPlace()) + i * wx[0].size()); - wh_tensor.Resize(phi::make_dim(wh.size(), wh[0].size())); + wh_tensor.Resize(common::make_dim(wh.size(), wh[0].size())); for (size_t i = 0; i < wh.size(); i++) std::copy( begin(wh[i]), diff --git a/test/cpp/inference/api/tester_helper.h b/test/cpp/inference/api/tester_helper.h index 8e5a9cd5034238..a410df859fe450 100644 --- 
a/test/cpp/inference/api/tester_helper.h +++ b/test/cpp/inference/api/tester_helper.h @@ -1098,8 +1098,8 @@ static bool CompareShape(const std::vector<int64_t> &a, static bool CompareTensorData(const phi::DenseTensor &a, const phi::DenseTensor &b) { - auto a_shape = phi::vectorize(a.dims()); - auto b_shape = phi::vectorize(b.dims()); + auto a_shape = common::vectorize(a.dims()); + auto b_shape = common::vectorize(b.dims()); size_t a_size = std::accumulate( a_shape.begin(), a_shape.end(), size_t{1}, [](int a, int b) { return a * b; }); @@ -1147,7 +1147,7 @@ static bool CompareTensor(const phi::DenseTensor &a, if (!CompareLoD(a.lod(), b.lod())) { return false; } - if (!CompareShape(phi::vectorize(a.dims()), phi::vectorize(b.dims()))) { + if (!CompareShape(common::vectorize(a.dims()), common::vectorize(b.dims()))) { return false; } diff --git a/test/cpp/inference/test_helper.h b/test/cpp/inference/test_helper.h index f66712401858ab..0107654d349b17 100644 --- a/test/cpp/inference/test_helper.h +++ b/test/cpp/inference/test_helper.h @@ -59,7 +59,7 @@ template <typename T> void SetupTensor(phi::DenseTensor* input, paddle::framework::DDim dims, const std::vector<T>& data) { - CHECK_EQ(phi::product(dims), static_cast<int64_t>(data.size())); + CHECK_EQ(common::product(dims), static_cast<int64_t>(data.size())); T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace()); memcpy(input_ptr, data.data(), input->numel() * sizeof(T)); } diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index ee1d5c94a9a17d..b5247b1902c425 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -8,6 +8,7 @@ if(WITH_TESTING AND NOT WIN32) WORKING_DIRECTORY "${CC_TESTS_DIR}") set(JIT_DEPS phi + common elementwise_add_op activation_op reduce_mean_op @@ -21,13 +22,4 @@ if(WITH_TESTING AND NOT WIN32) layer_test SRCS layer_test.cc DEPS ${JIT_DEPS}) - # add_dependencies(layer_test jit_download_program) - - cc_test( - layer_test_new - SRCS layer_test.cc - DEPS ${JIT_DEPS}) - # add_dependencies(layer_test_new jit_download_program) - # set_tests_properties(layer_test_new PROPERTIES ENVIRONMENT - # "FLAGS_jit_engine_type=New") endif() diff --git a/test/cpp/jit/layer_test.cc b/test/cpp/jit/layer_test.cc index c163f3c50d9dd3..1c3e76dbc6b15d 100644 --- a/test/cpp/jit/layer_test.cc +++ b/test/cpp/jit/layer_test.cc @@ -64,7 +64,7 @@ std::vector<Tensor> PrepareInputs(const phi::Place& place) { auto& dev_ctx = *pool.Get(place); DenseTensor t; - t.Resize(phi::make_ddim({2, 4})); + t.Resize(common::make_ddim({2, 4})); t.mutable_data<float>(place); phi::funcs::set_constant(dev_ctx, &t, 2.); diff --git a/test/cpp/new_executor/CMakeLists.txt b/test/cpp/new_executor/CMakeLists.txt index c5906fc0f263e3..cd80ee4944a533 100644 --- a/test/cpp/new_executor/CMakeLists.txt +++ b/test/cpp/new_executor/CMakeLists.txt @@ -1,7 +1,8 @@ # skip win32 since wget is not installed by default on windows machine. if(NOT WIN32) - paddle_test(standalone_executor_pir_test SRCS standalone_executor_pir_test.cc) + paddle_test(standalone_executor_pir_test SRCS standalone_executor_pir_test.cc + DEPS common) endif() set(OPS @@ -41,7 +42,7 @@ if(WITH_GPU # all operators used in the program # All deps of the operators above, part of GLOB_OPERATOR_DEPS. 
- set(OP_DEPS phi concat_and_split cross_entropy) + set(OP_DEPS phi common concat_and_split cross_entropy) cc_test(standalone_executor_test SRCS standalone_executor_test.cc) # add_dependencies(standalone_executor_test download_program) diff --git a/test/cpp/new_executor/standalone_executor_test.cc b/test/cpp/new_executor/standalone_executor_test.cc index e25f8e0aec99d2..727cb895c5e6b4 100644 --- a/test/cpp/new_executor/standalone_executor_test.cc +++ b/test/cpp/new_executor/standalone_executor_test.cc @@ -289,7 +289,7 @@ TEST(InterpreterCore, workqueue_multiplexing) { float data_a[] = {0, 1, 2, 3}; float data_b[] = {0.0, 0.1, 0.2, 0.3}; - phi::DDim dims = phi::make_ddim({2, 2}); + phi::DDim dims = common::make_ddim({2, 2}); const platform::CPUPlace place = platform::CPUPlace(); phi::DenseTensor tensor_a = phi::DenseTensor(); diff --git a/test/cpp/phi/api/CMakeLists.txt b/test/cpp/phi/api/CMakeLists.txt index fd06e6d460df97..c0b392b347fb9b 100644 --- a/test/cpp/phi/api/CMakeLists.txt +++ b/test/cpp/phi/api/CMakeLists.txt @@ -1,4 +1,4 @@ -set(COMMON_API_TEST_DEPS phi) +set(COMMON_API_TEST_DEPS phi common) if(WITH_GPU) nv_test( @@ -8,11 +8,11 @@ if(WITH_GPU) nv_test( test_allocator SRCS test_allocator.cu - DEPS place device_context phi) + DEPS place device_context phi common) nv_test( test_cuda_stream SRCS test_cuda_stream.cu - DEPS phi) + DEPS phi common) nv_test( test_from_blob SRCS test_from_blob.cc @@ -25,11 +25,11 @@ elseif(WITH_ROCM) hip_test( test_allocator SRCS test_allocator.cu - DEPS place device_context phi) + DEPS place device_context phi common) hip_test( test_cuda_stream SRCS test_cuda_stream.cu - DEPS phi) + DEPS phi common) hip_test( test_from_blob SRCS test_from_blob.cc diff --git a/test/cpp/phi/api/test_phi_exception.cc b/test/cpp/phi/api/test_phi_exception.cc index 99576a0254b5b5..7d0fdf1b57fab0 100644 --- a/test/cpp/phi/api/test_phi_exception.cc +++ b/test/cpp/phi/api/test_phi_exception.cc @@ -13,7 +13,7 @@ limitations under the License. 
*/ #include #include "gtest/gtest.h" -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" namespace paddle { namespace tests { diff --git a/test/cpp/phi/api/test_phi_tensor.cc b/test/cpp/phi/api/test_phi_tensor.cc index 67fdc4d0e53947..419e8a10e167d8 100644 --- a/test/cpp/phi/api/test_phi_tensor.cc +++ b/test/cpp/phi/api/test_phi_tensor.cc @@ -227,7 +227,7 @@ void TestDataInterface() { std::vector<int64_t> rows = {0}; std::shared_ptr<phi::SelectedRows> selected_rows = std::make_shared<phi::SelectedRows>(rows, 1); - selected_rows->mutable_value()->Resize(phi::make_ddim({1, 1})); + selected_rows->mutable_value()->Resize(common::make_ddim({1, 1})); selected_rows->mutable_value()->mutable_data<float>(phi::CPUPlace())[0] = static_cast<float>(10.0f); paddle::Tensor sr_tensor = paddle::Tensor(selected_rows); diff --git a/test/cpp/phi/api/test_strings_empty_api.cc b/test/cpp/phi/api/test_strings_empty_api.cc index 02c7705735170b..5160acf73f10cb 100644 --- a/test/cpp/phi/api/test_strings_empty_api.cc +++ b/test/cpp/phi/api/test_strings_empty_api.cc @@ -43,7 +43,7 @@ TEST(API, strings_empty) { auto dense_shape = std::make_shared<phi::DenseTensor>( alloc.get(), phi::DenseTensorMeta( - phi::DataType::INT64, phi::make_ddim({2}), phi::DataLayout::NCHW)); + phi::DataType::INT64, common::make_ddim({2}), phi::DataLayout::NCHW)); auto* dev_ctx = phi::DeviceContextPool::Instance().GetByPlace(phi::CPUPlace()); auto* shape_data = dev_ctx->template Alloc<int64_t>(dense_shape.get()); diff --git a/test/cpp/phi/api/test_to_api.cc b/test/cpp/phi/api/test_to_api.cc index 7a83003e118cf0..beef25a5fb9bce 100644 --- a/test/cpp/phi/api/test_to_api.cc +++ b/test/cpp/phi/api/test_to_api.cc @@ -33,8 +33,9 @@ paddle::Tensor CreateInputTensor() { std::make_unique<paddle::experimental::DefaultAllocator>(phi::CPUPlace()); auto dense_x = std::make_shared<phi::DenseTensor>( alloc.get(), - phi::DenseTensorMeta( - phi::DataType::INT64, phi::make_ddim({3, 4}), phi::DataLayout::NCHW)); + phi::DenseTensorMeta(phi::DataType::INT64, + common::make_ddim({3, 4}), + phi::DataLayout::NCHW)); auto* dev_ctx = phi::DeviceContextPool::Instance().GetByPlace(phi::CPUPlace()); auto* dense_x_data = dev_ctx->template Alloc<int64_t>(dense_x.get()); diff --git a/test/cpp/phi/common/CMakeLists.txt b/test/cpp/phi/common/CMakeLists.txt index b40e7e9f5a41e7..854a870420fcc0 100644 --- a/test/cpp/phi/common/CMakeLists.txt +++ b/test/cpp/phi/common/CMakeLists.txt @@ -13,32 +13,32 @@ cc_test( cc_test( phi_test_place SRCS test_place.cc - DEPS phi) + DEPS phi common) cc_test( phi_test_int_array SRCS test_int_array.cc - DEPS phi) + DEPS phi common) cc_test( phi_test_scalar_cpu SRCS test_scalar.cc - DEPS phi) + DEPS phi common) if(WITH_GPU) nv_test( phi_test_scalar SRCS test_scalar.cu - DEPS phi) + DEPS phi common) nv_test( transform_test SRCS transform_test.cu - DEPS memory place phi) + DEPS memory place phi common) endif() if(WITH_ROCM) hip_test( phi_test_scalar SRCS test_scalar.cu - DEPS phi) + DEPS phi common) hip_test( transform_test SRCS transform_test.cu - DEPS memory place phi) + DEPS memory place phi common) endif() diff --git a/test/cpp/phi/common/test_backend.cc b/test/cpp/phi/common/test_backend.cc index 516deeee34af20..97b5336dc17bfc 100644 --- a/test/cpp/phi/common/test_backend.cc +++ b/test/cpp/phi/common/test_backend.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/common/backend.h" namespace phi { diff --git a/test/cpp/phi/common/test_data_layout.cc b/test/cpp/phi/common/test_data_layout.cc index 889dfe07860c56..e267ea389f07a1 100644 --- a/test/cpp/phi/common/test_data_layout.cc +++ b/test/cpp/phi/common/test_data_layout.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include #include -#include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/common/layout.h" +#include "paddle/common/exception.h" +#include "paddle/common/layout.h" namespace phi { namespace tests { diff --git a/test/cpp/phi/common/test_data_type.cc b/test/cpp/phi/common/test_data_type.cc index 4d3d1de64924da..3bc2935b5abc82 100644 --- a/test/cpp/phi/common/test_data_type.cc +++ b/test/cpp/phi/common/test_data_type.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/api/ext/exception.h" +#include "paddle/common/exception.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/type_traits.h" diff --git a/test/cpp/phi/common/test_scalar.cu b/test/cpp/phi/common/test_scalar.cu index 9fbcb99cece136..b1748e957c565e 100644 --- a/test/cpp/phi/common/test_scalar.cu +++ b/test/cpp/phi/common/test_scalar.cu @@ -40,10 +40,10 @@ TEST(Scalar, ConstructFromDenseTensor1) { // 1. create tensor const auto alloc = std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::DenseTensor dense_x(alloc.get(), + phi::DenseTensorMeta(phi::DataType::FLOAT16, + common::make_ddim({1}), + phi::DataLayout::NCHW)); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); auto* dev_ctx = reinterpret_cast(pool.Get(phi::CPUPlace())); @@ -60,7 +60,7 @@ TEST(Scalar, ConstructFromDenseTensor2) { phi::DenseTensor dense_x( alloc.get(), phi::DenseTensorMeta( - phi::DataType::INT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::DataType::INT16, common::make_ddim({1}), phi::DataLayout::NCHW)); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); auto* dev_ctx = reinterpret_cast(pool.Get(phi::CPUPlace())); @@ -77,7 +77,7 @@ TEST(Scalar, ConstructFromDenseTensor3) { phi::DenseTensor dense_x( alloc.get(), phi::DenseTensorMeta( - phi::DataType::INT8, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::DataType::INT8, common::make_ddim({1}), phi::DataLayout::NCHW)); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); auto* dev_ctx = reinterpret_cast(pool.Get(phi::CPUPlace())); @@ -94,7 +94,7 @@ TEST(Scalar, ConstructFromDenseTensor4) { phi::DenseTensor dense_x( alloc.get(), phi::DenseTensorMeta( - phi::DataType::BOOL, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::DataType::BOOL, common::make_ddim({1}), phi::DataLayout::NCHW)); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); auto* dev_ctx = reinterpret_cast(pool.Get(phi::CPUPlace())); @@ -110,7 +110,7 @@ TEST(Scalar, ConstructFromDenseTensor5) { std::make_unique(phi::CPUPlace()); phi::DenseTensor dense_x(alloc.get(), phi::DenseTensorMeta(phi::DataType::COMPLEX64, - phi::make_ddim({1}), + common::make_ddim({1}), phi::DataLayout::NCHW)); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); auto* dev_ctx = reinterpret_cast(pool.Get(phi::CPUPlace())); @@ -128,7 +128,7 @@ TEST(Scalar, ConstructFromDenseTensor6) { std::make_unique(phi::CPUPlace()); phi::DenseTensor dense_x(alloc.get(), 
phi::DenseTensorMeta(phi::DataType::COMPLEX128, - phi::make_ddim({1}), + common::make_ddim({1}), phi::DataLayout::NCHW)); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); auto* dev_ctx = reinterpret_cast(pool.Get(phi::CPUPlace())); @@ -144,10 +144,10 @@ TEST(Scalar, ConstructFromDenseTensor7) { // 1. create tensor const auto alloc = std::make_unique(phi::GPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::DenseTensor dense_x(alloc.get(), + phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim({1}), + phi::DataLayout::NCHW)); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); auto* dev_ctx = reinterpret_cast(pool.Get(phi::GPUPlace())); @@ -164,8 +164,9 @@ TEST(Scalar, ConstructFromTensor) { std::make_unique(phi::GPUPlace()); auto dense_x = std::make_shared( alloc.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); + phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim({1}), + phi::DataLayout::NCHW)); phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); auto* dev_ctx = reinterpret_cast(pool.Get(phi::GPUPlace())); diff --git a/test/cpp/phi/core/CMakeLists.txt b/test/cpp/phi/core/CMakeLists.txt index be16b8c4e3508c..b5d83f69aabf8b 100644 --- a/test/cpp/phi/core/CMakeLists.txt +++ b/test/cpp/phi/core/CMakeLists.txt @@ -1,25 +1,25 @@ cc_test( test_custom_kernel SRCS test_custom_kernel.cc - DEPS phi) + DEPS phi common) cc_test( test_dense_tensor SRCS test_dense_tensor.cc - DEPS phi) + DEPS phi common) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) cc_test( test_kernel_factory SRCS test_kernel_factory.cc - DEPS phi) + DEPS phi common) cc_test( test_sparse_coo_tensor SRCS test_sparse_coo_tensor.cc - DEPS phi) + DEPS phi common) cc_test( test_sparse_csr_tensor SRCS test_sparse_csr_tensor.cc - DEPS phi) + DEPS phi common) cc_test( test_op_utils SRCS test_op_utils.cc @@ -27,28 +27,28 @@ cc_test( cc_test( test_meta_fn_utils SRCS test_meta_fn_utils.cc - DEPS phi) + DEPS phi common) cc_test( test_ddim SRCS test_ddim.cc - DEPS phi) + DEPS phi common) if(WITH_GPU) nv_test( test_dim SRCS test_dim.cu - DEPS phi) + DEPS phi common) elseif(WITH_ROCM) hip_test( test_dim SRCS test_dim.cu - DEPS phi) + DEPS phi common) endif() cc_test( selected_rows_test SRCS test_selected_rows.cc - DEPS phi) + DEPS phi common) if(WITH_TESTING AND TEST selected_rows_test) set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) endif() @@ -58,27 +58,27 @@ endif() cc_test( test_string_tensor SRCS test_string_tensor.cc - DEPS phi) + DEPS phi common) cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc) cc_test( test_tensor_array SRCS test_tensor_array.cc - DEPS phi) + DEPS phi common) if(WITH_GPU) nv_test( test_mixed_vector SRCS test_mixed_vector.cc test_mixed_vector.cu - DEPS place memory phi tensor) + DEPS place memory phi common tensor) elseif(WITH_ROCM) hip_test( test_mixed_vector SRCS test_mixed_vector.cc test_mixed_vector.cu - DEPS place memory phi tensor) + DEPS place memory phi common tensor) else() cc_test( test_mixed_vector SRCS test_mixed_vector.cc - DEPS place memory phi tensor) + DEPS place memory phi common tensor) endif() diff --git a/test/cpp/phi/core/test_custom_kernel.cc b/test/cpp/phi/core/test_custom_kernel.cc index 38f59589f72477..b4a9e9da619135 100644 --- a/test/cpp/phi/core/test_custom_kernel.cc +++ 
b/test/cpp/phi/core/test_custom_kernel.cc @@ -200,15 +200,17 @@ TEST(CustomKernel, custom_kernel_dot) { std::make_unique(phi::CPUPlace()); auto dense_x = std::make_shared( alloc.get(), - phi::DenseTensorMeta( - phi::DataType::UINT8, phi::make_ddim({2, 3}), phi::DataLayout::NCHW)); + phi::DenseTensorMeta(phi::DataType::UINT8, + common::make_ddim({2, 3}), + phi::DataLayout::NCHW)); auto* dev_ctx = phi::DeviceContextPool::Instance().Get(phi::CPUPlace()); auto* dense_x_data = dev_ctx->template Alloc(dense_x.get()); auto dense_y = std::make_shared( alloc.get(), - phi::DenseTensorMeta( - phi::DataType::UINT8, phi::make_ddim({2, 3}), phi::DataLayout::NCHW)); + phi::DenseTensorMeta(phi::DataType::UINT8, + common::make_ddim({2, 3}), + phi::DataLayout::NCHW)); auto* dense_y_data = dev_ctx->template Alloc(dense_y.get()); // dot x,y and result diff --git a/test/cpp/phi/core/test_ddim.cc b/test/cpp/phi/core/test_ddim.cc index a58d86e62aa403..78d8deebdca3d0 100644 --- a/test/cpp/phi/core/test_ddim.cc +++ b/test/cpp/phi/core/test_ddim.cc @@ -15,7 +15,7 @@ #include #include "gtest/gtest.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace tests { @@ -27,19 +27,19 @@ TEST(DDim, Equality) { EXPECT_EQ(default_ddim[0], 0); // construct a zero-DDim - phi::DDim zero_ddim = phi::make_ddim({}); + phi::DDim zero_ddim = common::make_ddim({}); EXPECT_EQ(arity(zero_ddim), 0); EXPECT_EQ(zero_ddim.size(), 0); - EXPECT_EQ(phi::product(zero_ddim), 1); + EXPECT_EQ(common::product(zero_ddim), 1); std::vector zero_vec; - phi::DDim zero_ddim1 = phi::make_ddim(zero_vec); + phi::DDim zero_ddim1 = common::make_ddim(zero_vec); EXPECT_EQ(arity(zero_ddim1), 0); EXPECT_EQ(zero_ddim1.size(), 0); - EXPECT_EQ(phi::product(zero_ddim1), 1); + EXPECT_EQ(common::product(zero_ddim1), 1); // zero-DDim to vector - std::vector zero_ddim_vec = phi::vectorize(zero_ddim); + std::vector zero_ddim_vec = common::vectorize(zero_ddim); EXPECT_EQ(zero_ddim_vec.size(), size_t(0)); // reshape zero-DDim @@ -47,16 +47,16 @@ TEST(DDim, Equality) { phi::DDim reshape_ddim = zero_ddim.reshape(reshape_vec); EXPECT_EQ(arity(reshape_ddim), 1); EXPECT_EQ(reshape_ddim.size(), 1); - EXPECT_EQ(phi::product(reshape_ddim), 1); + EXPECT_EQ(common::product(reshape_ddim), 1); // construct a DDim from an initialization list - phi::DDim ddim = phi::make_ddim({9, 1, 5}); + phi::DDim ddim = common::make_ddim({9, 1, 5}); EXPECT_EQ(ddim[0], 9); EXPECT_EQ(ddim[1], 1); EXPECT_EQ(ddim[2], 5); // arity of a DDim - EXPECT_EQ(phi::arity(ddim), 3); + EXPECT_EQ(common::arity(ddim), 3); EXPECT_EQ(ddim.size(), 3); // mutate a DDim @@ -67,35 +67,35 @@ TEST(DDim, Equality) { // construct a DDim from a vector std::vector vec({9, 1, 5}); - phi::DDim vddim = phi::make_ddim(vec); + phi::DDim vddim = common::make_ddim(vec); EXPECT_EQ(vddim[0], 9); EXPECT_EQ(vddim[1], 1); EXPECT_EQ(vddim[2], 5); // vectorize a DDim - std::vector res_vec = phi::vectorize(vddim); + std::vector res_vec = common::vectorize(vddim); EXPECT_EQ(res_vec[0], 9); EXPECT_EQ(res_vec[1], 1); EXPECT_EQ(res_vec[2], 5); phi::Dim<3> d(3, 2, 1); - res_vec = phi::vectorize(phi::DDim(d)); + res_vec = common::vectorize(phi::DDim(d)); EXPECT_EQ(res_vec[0], 3); EXPECT_EQ(res_vec[1], 2); EXPECT_EQ(res_vec[2], 1); // product of a DDim - EXPECT_EQ(phi::product(vddim), 45); - EXPECT_EQ(phi::product(phi::make_ddim({3, 2, 5, 3})), 90); + EXPECT_EQ(common::product(vddim), 45); + EXPECT_EQ(common::product(common::make_ddim({3, 2, 5, 3})), 90); // slice a DDim - phi::DDim ddim2 = phi::make_ddim({1, 
2, 3, 4, 5, 6}); - phi::DDim slice_dim1 = phi::slice_ddim(ddim2, 2, 5); + phi::DDim ddim2 = common::make_ddim({1, 2, 3, 4, 5, 6}); + phi::DDim slice_dim1 = common::slice_ddim(ddim2, 2, 5); EXPECT_EQ(arity(slice_dim1), 3); EXPECT_EQ(slice_dim1[0], 3); EXPECT_EQ(slice_dim1[1], 4); EXPECT_EQ(slice_dim1[2], 5); - phi::DDim slice_dim2 = phi::slice_ddim(ddim2, 0, 6); + phi::DDim slice_dim2 = common::slice_ddim(ddim2, 0, 6); EXPECT_EQ(arity(slice_dim2), 6); EXPECT_EQ(slice_dim2[0], 1); EXPECT_EQ(slice_dim2[1], 2); @@ -104,22 +104,22 @@ TEST(DDim, Equality) { EXPECT_EQ(slice_dim2[4], 5); EXPECT_EQ(slice_dim2[5], 6); - phi::DDim slice_dim3 = phi::slice_ddim(ddim2, 1, 1); + phi::DDim slice_dim3 = common::slice_ddim(ddim2, 1, 1); EXPECT_EQ(arity(slice_dim3), 0); EXPECT_EQ(slice_dim3.size(), 0); - EXPECT_EQ(phi::product(slice_dim3), 1); + EXPECT_EQ(common::product(slice_dim3), 1); } TEST(DDim, Print) { // print a DDim std::stringstream ss1; - phi::DDim ddim = phi::make_ddim({2, 3, 4}); + phi::DDim ddim = common::make_ddim({2, 3, 4}); ss1 << ddim; EXPECT_EQ("2, 3, 4", ss1.str()); // print a zero-DDim std::stringstream ss2; - phi::DDim zero_ddim = phi::make_ddim({}); + phi::DDim zero_ddim = common::make_ddim({}); ss2 << zero_ddim; EXPECT_EQ("", ss2.str()); } @@ -127,7 +127,7 @@ TEST(DDim, Print) { TEST(DDim, Hash) { // hash a DDim std::size_t h = 0; - phi::DDim ddim = phi::make_ddim({2, 3, 4}); + phi::DDim ddim = common::make_ddim({2, 3, 4}); h = std::hash()(ddim); EXPECT_EQ(h, 0xa16fb2b2967ul); } diff --git a/test/cpp/phi/core/test_dim.cu b/test/cpp/phi/core/test_dim.cu index 2a449191367b4e..cf7196dadd3550 100644 --- a/test/cpp/phi/core/test_dim.cu +++ b/test/cpp/phi/core/test_dim.cu @@ -17,21 +17,21 @@ #include #include "gtest/gtest.h" -#include "paddle/phi/core/utils/dim.h" +#include "paddle/common/dim.h" namespace phi { namespace tests { -__global__ void test(phi::Dim<2>* o) { o[0] = phi::make_dim(5, 6); } +__global__ void test(phi::Dim<2>* o) { o[0] = common::make_dim(5, 6); } __global__ void dyn_idx_gpu(int64_t* o) { - auto d = phi::make_dim(5, 6); + auto d = common::make_dim(5, 6); o[0] = d[1]; } TEST(Dim, Equality) { // construct a Dim on the CPU - auto a = phi::make_dim(3, 4); + auto a = common::make_dim(3, 4); EXPECT_EQ(a[0], 3); EXPECT_EQ(a[1], 4); @@ -48,10 +48,10 @@ TEST(Dim, Equality) { EXPECT_EQ(a[1], 6); // product - EXPECT_EQ(phi::product(a), 30); + EXPECT_EQ(common::product(a), 30); // mutate a Dim - auto b = phi::make_dim(7, 8); + auto b = common::make_dim(7, 8); b[1] = 10; EXPECT_EQ(b[0], 7); EXPECT_EQ(b[1], 10); @@ -74,9 +74,9 @@ TEST(Dim, Equality) { } TEST(Dim, Bool) { - auto a = phi::make_dim(3, 4); - auto b = phi::make_dim(5, 6); - auto c = phi::make_dim(3, 4); + auto a = common::make_dim(3, 4); + auto b = common::make_dim(5, 6); + auto c = common::make_dim(3, 4); // comparison EXPECT_TRUE(a == a); @@ -87,13 +87,13 @@ TEST(Dim, Bool) { TEST(Dim, Print) { { std::stringstream ss; - auto a = phi::make_dim(2, 3); + auto a = common::make_dim(2, 3); ss << a; EXPECT_EQ(ss.str(), "2, 3"); } { std::stringstream ss; - ss << phi::make_dim(8); + ss << common::make_dim(8); EXPECT_EQ(ss.str(), "8"); } } diff --git a/test/cpp/phi/core/test_meta_fn_utils.cc b/test/cpp/phi/core/test_meta_fn_utils.cc index 6c26d38a95a75e..7112f332abe2dd 100644 --- a/test/cpp/phi/core/test_meta_fn_utils.cc +++ b/test/cpp/phi/core/test_meta_fn_utils.cc @@ -24,7 +24,7 @@ namespace tests { TEST(MetaFnFactory, InferMetaFnExists) { phi::DenseTensor dense_x; - dense_x.Resize(phi::make_ddim({3, 4})); + 
dense_x.Resize(common::make_ddim({3, 4})); phi::MetaTensor meta_x(&dense_x); phi::DenseTensor dense_out1; diff --git a/test/cpp/phi/core/test_selected_rows.cc b/test/cpp/phi/core/test_selected_rows.cc index 1f56d851a7b5bc..e55266279d22bb 100644 --- a/test/cpp/phi/core/test_selected_rows.cc +++ b/test/cpp/phi/core/test_selected_rows.cc @@ -31,7 +31,8 @@ class SelectedRowsTester : public ::testing::Test { phi::DenseTensor* value = selected_rows_->mutable_value(); auto* data = value->mutable_data( - phi::make_ddim({static_cast(rows.size()), row_numel}), place_); + common::make_ddim({static_cast(rows.size()), row_numel}), + place_); for (int64_t i = 0; i < value->numel(); ++i) { data[i] = static_cast(i); } @@ -45,11 +46,11 @@ class SelectedRowsTester : public ::testing::Test { TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } TEST_F(SelectedRowsTester, dims) { - ASSERT_EQ(selected_rows_->value().dims(), phi::make_ddim({3, 100})); + ASSERT_EQ(selected_rows_->value().dims(), common::make_ddim({3, 100})); } TEST_F(SelectedRowsTester, complete_dims) { - ASSERT_EQ(selected_rows_->GetCompleteDims(), phi::make_ddim({10, 100})); + ASSERT_EQ(selected_rows_->GetCompleteDims(), common::make_ddim({10, 100})); } TEST(SelectedRows, SparseTable) { @@ -59,7 +60,8 @@ TEST(SelectedRows, SparseTable) { int64_t table_size = 100; int64_t embedding_width = 8; // initialize a sparse table - table.mutable_value()->Resize(phi::make_ddim({table_size, embedding_width})); + table.mutable_value()->Resize( + common::make_ddim({table_size, embedding_width})); auto* data = table.mutable_value()->mutable_data(cpu); for (int64_t i = 0; i < table_size; ++i) { for (int64_t j = 0; j < embedding_width; ++j) { @@ -80,7 +82,7 @@ TEST(SelectedRows, SparseTable) { ASSERT_EQ(table.rows().size(), 3UL); phi::DenseTensor ids; - ids.Resize(phi::make_ddim({4})); + ids.Resize(common::make_ddim({4})); auto* ids_data = ids.mutable_data(cpu); ids_data[0] = static_cast(6); ids_data[1] = static_cast(6); @@ -88,8 +90,8 @@ TEST(SelectedRows, SparseTable) { ids_data[3] = static_cast(10); phi::DenseTensor get_value; - auto* value_data = - get_value.mutable_data(phi::make_ddim({4, embedding_width}), cpu); + auto* value_data = get_value.mutable_data( + common::make_ddim({4, embedding_width}), cpu); table.Get(ids, &get_value); for (int j = 0; j < embedding_width; ++j) { @@ -157,7 +159,8 @@ TEST(SelectedRows, MultiThreadAutoIndex) { int64_t table_size = 100000; int64_t embedding_width = 8; // initialize a sparse table - table.mutable_value()->Resize(phi::make_ddim({table_size, embedding_width})); + table.mutable_value()->Resize( + common::make_ddim({table_size, embedding_width})); auto* data = table.mutable_value()->mutable_data(cpu); for (int64_t i = 0; i < table_size; ++i) { for (int64_t j = 0; j < embedding_width; ++j) { diff --git a/test/cpp/phi/core/test_sparse_coo_tensor.cc b/test/cpp/phi/core/test_sparse_coo_tensor.cc index e6d134ffb52350..d3e46fba334a6e 100644 --- a/test/cpp/phi/core/test_sparse_coo_tensor.cc +++ b/test/cpp/phi/core/test_sparse_coo_tensor.cc @@ -23,20 +23,21 @@ namespace tests { TEST(sparse_coo_tensor, construct) { phi::CPUPlace cpu; - auto dense_dims = phi::make_ddim({3, 3}); + auto dense_dims = common::make_ddim({3, 3}); std::vector non_zero_data = {1.0, 2.0, 3.0}; std::vector indices_data = {0, 1, 2, 0, 2, 1}; auto fancy_allocator = std::unique_ptr(new FancyAllocator); auto* alloc = fancy_allocator.get(); auto indices_dims = - phi::make_ddim({2, static_cast(non_zero_data.size())}); + 
common::make_ddim({2, static_cast(non_zero_data.size())}); DenseTensorMeta indices_meta(DataType::INT64, indices_dims, DataLayout::NCHW); DenseTensor indices(alloc, indices_meta); memcpy(indices.mutable_data(cpu), &indices_data[0], indices_data.size() * sizeof(int64_t)); - auto elements_dims = phi::make_ddim({static_cast(non_zero_data.size())}); + auto elements_dims = + common::make_ddim({static_cast(non_zero_data.size())}); DenseTensorMeta elements_meta( DataType::FLOAT32, elements_dims, DataLayout::NCHW); DenseTensor elements(alloc, elements_meta); @@ -58,13 +59,13 @@ TEST(sparse_coo_tensor, construct) { TEST(sparse_coo_tensor, other_function) { auto fancy_allocator = std::unique_ptr(new FancyAllocator); auto* alloc = fancy_allocator.get(); - auto dense_dims = phi::make_ddim({4, 4}); + auto dense_dims = common::make_ddim({4, 4}); const int non_zero_num = 2; - auto indices_dims = phi::make_ddim({2, non_zero_num}); + auto indices_dims = common::make_ddim({2, non_zero_num}); DenseTensorMeta indices_meta(DataType::INT64, indices_dims, DataLayout::NCHW); DenseTensor indices(alloc, indices_meta); - auto elements_dims = phi::make_ddim({non_zero_num}); + auto elements_dims = common::make_ddim({non_zero_num}); DenseTensorMeta elements_meta( DataType::FLOAT32, elements_dims, DataLayout::NCHW); DenseTensor elements(alloc, elements_meta); @@ -74,7 +75,7 @@ TEST(sparse_coo_tensor, other_function) { CHECK_EQ(coo.dims(), dense_dims); // Test Resize - auto dense_dims_3d = phi::make_ddim({2, 4, 4}); + auto dense_dims_3d = common::make_ddim({2, 4, 4}); coo.Resize(dense_dims_3d, 1, 3); CHECK_EQ(coo.nnz(), 3); diff --git a/test/cpp/phi/core/test_sparse_csr_tensor.cc b/test/cpp/phi/core/test_sparse_csr_tensor.cc index 56f671a7fc7e9e..78f19a1ba580d1 100644 --- a/test/cpp/phi/core/test_sparse_csr_tensor.cc +++ b/test/cpp/phi/core/test_sparse_csr_tensor.cc @@ -24,7 +24,7 @@ namespace tests { TEST(sparse_csr_tensor, construct) { phi::CPUPlace cpu; - auto dense_dims = phi::make_ddim({3, 3}); + auto dense_dims = common::make_ddim({3, 3}); std::vector non_zero_data = {1.0, 2.0, 3.0}; std::vector crows_data = {0, 1, 1, 3}; std::vector cols_data = {1, 0, 2}; @@ -32,7 +32,7 @@ TEST(sparse_csr_tensor, construct) { auto fancy_allocator = std::unique_ptr(new FancyAllocator); auto alloc = fancy_allocator.get(); // create non_zero_crows - auto crows_dims = phi::make_ddim({static_cast(crows_data.size())}); + auto crows_dims = common::make_ddim({static_cast(crows_data.size())}); DenseTensorMeta crows_meta(DataType::INT64, crows_dims, DataLayout::NCHW); DenseTensor crows(alloc, crows_meta); memcpy(crows.mutable_data(cpu), @@ -40,7 +40,7 @@ TEST(sparse_csr_tensor, construct) { crows_data.size() * sizeof(int64_t)); // create non_zero_cols - auto cols_dims = phi::make_ddim({static_cast(cols_data.size())}); + auto cols_dims = common::make_ddim({static_cast(cols_data.size())}); DenseTensorMeta cols_meta(DataType::INT64, cols_dims, DataLayout::NCHW); DenseTensor cols(alloc, cols_meta); memcpy(cols.mutable_data(cpu), @@ -48,7 +48,8 @@ TEST(sparse_csr_tensor, construct) { cols_data.size() * sizeof(int64_t)); // create non_zero_elements - auto elements_dims = phi::make_ddim({static_cast(non_zero_data.size())}); + auto elements_dims = + common::make_ddim({static_cast(non_zero_data.size())}); DenseTensorMeta elements_meta( DataType::FLOAT32, elements_dims, DataLayout::NCHW); DenseTensor elements(alloc, elements_meta); @@ -70,13 +71,13 @@ TEST(sparse_csr_tensor, construct) { TEST(sparse_csr_tensor, other_function) { auto fancy_allocator 
= std::unique_ptr(new FancyAllocator); auto alloc = fancy_allocator.get(); - auto dense_dims = phi::make_ddim({4, 4}); - auto crows_dims = phi::make_ddim({dense_dims[0] + 1}); + auto dense_dims = common::make_ddim({4, 4}); + auto crows_dims = common::make_ddim({dense_dims[0] + 1}); DenseTensorMeta crows_meta(DataType::INT64, crows_dims, DataLayout::NCHW); DenseTensor crows(alloc, crows_meta); const int64_t non_zero_num = 5; - auto cols_dims = phi::make_ddim({non_zero_num}); + auto cols_dims = common::make_ddim({non_zero_num}); DenseTensorMeta cols_meta(DataType::INT64, cols_dims, DataLayout::NCHW); DenseTensor cols(alloc, cols_meta); DenseTensorMeta values_meta(DataType::FLOAT32, cols_dims, DataLayout::NCHW); @@ -87,7 +88,7 @@ TEST(sparse_csr_tensor, other_function) { CHECK_EQ(csr.dims(), dense_dims); // Test Resize - auto dense_dims_3d = phi::make_ddim({2, 4, 4}); + auto dense_dims_3d = common::make_ddim({2, 4, 4}); csr.Resize(dense_dims_3d, 2); CHECK_EQ(csr.non_zero_cols().numel(), 2); diff --git a/test/cpp/phi/core/test_tensor_array.cc b/test/cpp/phi/core/test_tensor_array.cc index 201790a7bc0e10..ae2685d6fc98e7 100644 --- a/test/cpp/phi/core/test_tensor_array.cc +++ b/test/cpp/phi/core/test_tensor_array.cc @@ -17,9 +17,9 @@ limitations under the License. */ #include #include "gtest/gtest.h" +#include "paddle/common/errors.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/tensor_array.h" #include "test/cpp/phi/core/allocator.h" diff --git a/test/cpp/phi/core/unroll_array_ops_test.cc b/test/cpp/phi/core/unroll_array_ops_test.cc index ddcf48844a7ad3..65d00dace78cc6 100644 --- a/test/cpp/phi/core/unroll_array_ops_test.cc +++ b/test/cpp/phi/core/unroll_array_ops_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
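// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: every hunk in this section
// follows the same pattern. The DDim helpers move from the phi namespace to
// common while phi::DDim itself keeps working at the old call sites, so each
// test only swaps the helper prefix, the matching include, and adds a `common`
// entry to DEPS in CMake. Below is a minimal self-contained usage of the
// migrated API; the test name is hypothetical, but the header and the
// functions are exactly the ones this patch switches the tests to.
#include <cstdint>
#include <vector>

#include "gtest/gtest.h"

#include "paddle/common/ddim.h"  // previously paddle/phi/core/ddim.h

TEST(CommonDDimMigrationSketch, Helpers) {
  phi::DDim dims = common::make_ddim({2, 3, 4});  // previously phi::make_ddim
  EXPECT_EQ(common::product(dims), 24);           // previously phi::product
  EXPECT_EQ(common::vectorize(dims),              // previously phi::vectorize
            (std::vector<int64_t>{2, 3, 4}));
  EXPECT_EQ(common::slice_ddim(dims, 1, 3),       // previously phi::slice_ddim
            common::make_ddim({3, 4}));
}
// ---------------------------------------------------------------------------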
-#include "paddle/phi/core/utils/unroll_array_ops.h" +#include "paddle/common/unroll_array_ops.h" #include @@ -32,7 +32,7 @@ bool FillConstantTestMain() { std::array arr; arr.fill(0); - UnrollFillConstant::Run(arr.data(), 1); + common::UnrollFillConstant::Run(arr.data(), 1); return CheckEquality(arr.data(), D2, 1) && CheckEquality(arr.data() + D2, arr.size() - D2, 0); } @@ -47,7 +47,7 @@ TEST(unroll_ops, fill_constant) { TEST(unroll_ops, assign) { const int a[] = {1, 2, 3, 4, 5}; // NOLINT int b[] = {0, 0, 0, 0, 0}; // NOLINT - UnrollAssign<3>::Run(a, b); + common::UnrollAssign<3>::Run(a, b); EXPECT_EQ(b[0], 1); EXPECT_EQ(b[1], 2); EXPECT_EQ(b[2], 3); @@ -57,7 +57,7 @@ TEST(unroll_ops, assign) { TEST(unroll_ops, var_args_assign) { int a[] = {0, 0, 0}; // NOLINT - UnrollVarArgsAssign::Run(a, 1, 2); + common::UnrollVarArgsAssign::Run(a, 1, 2); EXPECT_EQ(a[0], 1); EXPECT_EQ(a[1], 2); EXPECT_EQ(a[2], 0); @@ -66,17 +66,17 @@ TEST(unroll_ops, var_args_assign) { TEST(unroll_ops, compare) { int a[] = {1, 2, 3}; // NOLINT int b[] = {1, 2, 4}; // NOLINT - EXPECT_TRUE(UnrollCompare<2>::Run(a, b)); - EXPECT_FALSE(UnrollCompare<3>::Run(a, b)); + EXPECT_TRUE(common::UnrollCompare<2>::Run(a, b)); + EXPECT_FALSE(common::UnrollCompare<3>::Run(a, b)); b[0] = -1; - EXPECT_TRUE(UnrollCompare<0>::Run(a, b)); - EXPECT_FALSE(UnrollCompare<1>::Run(a, b)); + EXPECT_TRUE(common::UnrollCompare<0>::Run(a, b)); + EXPECT_FALSE(common::UnrollCompare<1>::Run(a, b)); } TEST(unroll_ops, product) { int a[] = {2, 3, 4}; // NOLINT - EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]); + EXPECT_EQ(common::UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]); } } // namespace framework diff --git a/test/cpp/phi/kernels/CMakeLists.txt b/test/cpp/phi/kernels/CMakeLists.txt index a4906b3d1a879c..36e7a0b10310b8 100644 --- a/test/cpp/phi/kernels/CMakeLists.txt +++ b/test/cpp/phi/kernels/CMakeLists.txt @@ -1,12 +1,12 @@ cc_test( test_math_function SRCS test_math_function.cc - DEPS phi) + DEPS phi common) if(WITH_GPU) nv_test( test_math_function_gpu SRCS test_math_function.cu - DEPS phi) + DEPS phi common) nv_test( test_broadcast_gpu SRCS test_ternary_broadcast.cu @@ -16,56 +16,56 @@ if(WITH_ROCM) hip_test( test_math_function_gpu SRCS test_math_function.cu - DEPS phi) + DEPS phi common) endif() cc_test( test_cpu_vec SRCS test_cpu_vec.cc - DEPS phi) + DEPS phi common) # For String Kernels cc_test( test_strings_lower_upper_dev_api SRCS test_strings_lower_upper_dev_api.cc - DEPS phi) + DEPS phi common) if(WITH_GPU) nv_test( test_strings_lower_upper_dev_gpu_api SRCS test_strings_lower_upper_dev_api.cu - DEPS phi) + DEPS phi common) elseif(WITH_ROCM) hip_test( test_strings_lower_upper_dev_gpu_api SRCS test_strings_lower_upper_dev_api.cu - DEPS phi) + DEPS phi common) endif() cc_test( test_strings_copy_dev_api SRCS test_strings_copy_dev_api.cc - DEPS phi) + DEPS phi common) if(WITH_GPU) nv_test( test_strings_copy_dev_gpu_api SRCS test_strings_copy_dev_api.cu - DEPS phi) + DEPS phi common) elseif(WITH_ROCM) hip_test( test_strings_copy_dev_gpu_api SRCS test_strings_copy_dev_api.cu - DEPS phi) + DEPS phi common) endif() cc_test( test_memcpy_dev_api SRCS test_memcpy_dev_api.cc - DEPS phi) + DEPS phi common) cc_test( test_transfer_layout_dev_api SRCS test_transfer_layout_dev_api.cc - DEPS phi) + DEPS phi common) if(WITH_GPU) nv_test( @@ -79,7 +79,7 @@ if(WITH_GPU) cc_test( test_fused_adam_kernel SRCS test_fused_adam_kernel.cc - DEPS gtest phi) + DEPS gtest phi common) elseif(WITH_ROCM) hip_test( test_gpu_timer @@ -94,19 +94,19 @@ endif() cc_test( 
test_cache SRCS test_cache.cc - DEPS gtest phi) + DEPS gtest phi common) cc_test( strided_memcpy_test SRCS strided_memcpy_test.cc - DEPS phi memory) + DEPS phi common memory) cc_test( sequence_padding_test SRCS sequence_padding_test.cc - DEPS phi) + DEPS phi common) cc_test( sequence_pooling_test SRCS sequence_pooling_test.cc - DEPS phi) + DEPS phi common) diff --git a/test/cpp/phi/kernels/sequence_padding_test.cc b/test/cpp/phi/kernels/sequence_padding_test.cc index 015d6f354c5beb..dab519337536e3 100644 --- a/test/cpp/phi/kernels/sequence_padding_test.cc +++ b/test/cpp/phi/kernels/sequence_padding_test.cc @@ -31,8 +31,8 @@ void TestSequencePadding(const DeviceContext &context, phi::DenseTensor pad_value; const size_t level = lod.size() - 1; - auto seq_dims = phi::make_ddim({static_cast(lod[level].back()), - static_cast(sequence_width)}); + auto seq_dims = common::make_ddim({static_cast(lod[level].back()), + static_cast(sequence_width)}); cpu_seq.set_lod(lod); auto *dev_ctx = static_cast( @@ -55,9 +55,10 @@ void TestSequencePadding(const DeviceContext &context, const size_t max_sequence_length = phi::funcs::MaximumSequenceLength(lod[level]); const size_t num_sequences = lod[level].size() - 1; - auto padding_dims = phi::make_ddim({static_cast(max_sequence_length), - static_cast(num_sequences), - static_cast(sequence_width)}); + auto padding_dims = + common::make_ddim({static_cast(max_sequence_length), + static_cast(num_sequences), + static_cast(sequence_width)}); padding.Resize(padding_dims); context.template Alloc(&padding); diff --git a/test/cpp/phi/kernels/sequence_pooling_test.cc b/test/cpp/phi/kernels/sequence_pooling_test.cc index 037ad314890c5f..2df2ffa12f969d 100644 --- a/test/cpp/phi/kernels/sequence_pooling_test.cc +++ b/test/cpp/phi/kernels/sequence_pooling_test.cc @@ -31,7 +31,7 @@ void TestSequencePoolingSum(const DeviceContext &context, // construct out_grad's tensor in cpu const size_t out_first_dim = lod[0].size() - 1; auto out_dims = - phi::make_ddim({static_cast(out_first_dim), second_dim}); + common::make_ddim({static_cast(out_first_dim), second_dim}); cpu_out_grad.mutable_data(out_dims, phi::CPUPlace()); for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) { @@ -49,7 +49,7 @@ void TestSequencePoolingSum(const DeviceContext &context, // construct in_grad in_grad.set_lod(lod); auto in_dims = - phi::make_ddim({static_cast(lod[0].back()), second_dim}); + common::make_ddim({static_cast(lod[0].back()), second_dim}); in_grad.mutable_data(in_dims, place); // check tensor contruction result diff --git a/test/cpp/phi/kernels/test_auto_tune.cu b/test/cpp/phi/kernels/test_auto_tune.cu index 302f8809d2d575..ecdb4a0311bc17 100644 --- a/test/cpp/phi/kernels/test_auto_tune.cu +++ b/test/cpp/phi/kernels/test_auto_tune.cu @@ -83,12 +83,14 @@ TEST(AutoTune, sum) { std::make_unique(phi::CPUPlace()); auto in1 = std::make_shared( alloc_cpu.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT32, phi::make_ddim({N}), phi::DataLayout::NCHW)); + phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim({N}), + phi::DataLayout::NCHW)); auto in2 = std::make_shared( alloc_cpu.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT32, phi::make_ddim({N}), phi::DataLayout::NCHW)); + phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim({N}), + phi::DataLayout::NCHW)); float* in1_data = in1->data(); float* in2_data = in2->data(); @@ -106,12 +108,14 @@ TEST(AutoTune, sum) { auto d_in1 = std::make_shared( alloc_cuda.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT32, 
phi::make_ddim({N}), phi::DataLayout::NCHW)); + phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim({N}), + phi::DataLayout::NCHW)); auto d_in2 = std::make_shared( alloc_cuda.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT32, phi::make_ddim({N}), phi::DataLayout::NCHW)); + phi::DenseTensorMeta(phi::DataType::FLOAT32, + common::make_ddim({N}), + phi::DataLayout::NCHW)); phi::Copy(*dev_ctx, *in1.get(), phi::GPUPlace(), false, d_in1.get()); phi::Copy(*dev_ctx, *in2.get(), phi::GPUPlace(), false, d_in2.get()); diff --git a/test/cpp/phi/kernels/test_fused_adam_kernel.cc b/test/cpp/phi/kernels/test_fused_adam_kernel.cc index 7084b85ba73882..b4edd6b0c19770 100644 --- a/test/cpp/phi/kernels/test_fused_adam_kernel.cc +++ b/test/cpp/phi/kernels/test_fused_adam_kernel.cc @@ -366,8 +366,12 @@ auto MaxDiff(const Context &ctx, diff_reduced.Resize({1}); ctx.template Alloc(&diff_reduced); - MaxRawKernel( - ctx, diff, vectorize(x.dims()), false, true, &diff_reduced); + MaxRawKernel(ctx, + diff, + common::vectorize(x.dims()), + false, + true, + &diff_reduced); diff_reduced_cpu.Resize(diff_reduced.dims()); ctx.template HostAlloc(&diff_reduced_cpu); diff --git a/test/cpp/phi/kernels/test_memcpy_dev_api.cc b/test/cpp/phi/kernels/test_memcpy_dev_api.cc index a7f65e5b6dd6f7..2d06dcab602ea1 100644 --- a/test/cpp/phi/kernels/test_memcpy_dev_api.cc +++ b/test/cpp/phi/kernels/test_memcpy_dev_api.cc @@ -37,7 +37,7 @@ TEST(DEV_API, memcpy_d2h) { std::make_unique(phi::CPUPlace()); phi::DenseTensor x_cpu(cpu_alloc.get(), phi::DenseTensorMeta(phi::DataType::FLOAT32, - phi::make_ddim({3, 2, 2, 3}), + common::make_ddim({3, 2, 2, 3}), phi::DataLayout::NCHW)); auto& pool = phi::DeviceContextPool::Instance(); auto* cpu_ctx = pool.GetByPlace(phi::CPUPlace()); diff --git a/test/cpp/phi/kernels/test_ternary_broadcast.cu b/test/cpp/phi/kernels/test_ternary_broadcast.cu index 959b79725f07ae..137416df764673 100644 --- a/test/cpp/phi/kernels/test_ternary_broadcast.cu +++ b/test/cpp/phi/kernels/test_ternary_broadcast.cu @@ -102,10 +102,10 @@ TEST(Broadcast, add) { size_t times = 10; do { - auto dim1 = phi::make_ddim({1, 2048, 3584}); - auto dim2 = phi::make_ddim({1, 2048, 1}); - auto dim3 = phi::make_ddim({1, 1, 3584}); - auto dim_out = phi::make_ddim({1, 2048, 3584}); + auto dim1 = common::make_ddim({1, 2048, 3584}); + auto dim2 = common::make_ddim({1, 2048, 1}); + auto dim3 = common::make_ddim({1, 1, 3584}); + auto dim_out = common::make_ddim({1, 2048, 3584}); TestCase( *dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_1()); TestCase(*dev_ctx, @@ -141,10 +141,10 @@ TEST(Broadcast, add) { } while (0); do { - auto dim1 = phi::make_ddim({1, 256, 4, 256, 256}); - auto dim2 = phi::make_ddim({1, 256, 1, 1, 256}); - auto dim3 = phi::make_ddim({1, 1, 4, 256, 256}); - auto dim_out = phi::make_ddim({1, 256, 4, 256, 256}); + auto dim1 = common::make_ddim({1, 256, 4, 256, 256}); + auto dim2 = common::make_ddim({1, 256, 1, 1, 256}); + auto dim3 = common::make_ddim({1, 1, 4, 256, 256}); + auto dim_out = common::make_ddim({1, 256, 4, 256, 256}); TestCase( *dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_2()); TestCase(*dev_ctx, @@ -180,10 +180,10 @@ TEST(Broadcast, add) { } while (0); do { - auto dim1 = phi::make_ddim({1, 256, 256}); - auto dim2 = phi::make_ddim({1, 1, 256}); - auto dim3 = phi::make_ddim({1, 256, 1}); - auto dim_out = phi::make_ddim({1, 256, 256}); + auto dim1 = common::make_ddim({1, 256, 256}); + auto dim2 = common::make_ddim({1, 1, 256}); + auto dim3 = common::make_ddim({1, 256, 1}); + auto dim_out 
= common::make_ddim({1, 256, 256}); TestCase( *dev_ctx, dim1, dim2, dim3, dim_out, times, AddTernary_3()); TestCase(*dev_ctx, diff --git a/test/cpp/phi/kernels/test_transfer_layout_dev_api.cc b/test/cpp/phi/kernels/test_transfer_layout_dev_api.cc index f656ee9f59829d..b7da7dc397cd48 100644 --- a/test/cpp/phi/kernels/test_transfer_layout_dev_api.cc +++ b/test/cpp/phi/kernels/test_transfer_layout_dev_api.cc @@ -41,7 +41,7 @@ TEST(DEV_API, transfer_layout) { MetaTensor meta_x(&x); meta_x.set_dtype(DataType::FLOAT32); meta_x.set_layout(DataLayout::ONEDNN); - meta_x.set_dims(make_ddim({n, c, h, w})); + meta_x.set_dims(common::make_ddim({n, c, h, w})); DenseTensor out; @@ -63,7 +63,7 @@ TEST(DEV_API, transfer_layout) { // 3. check result std::vector expect_shape = {12, 3}; - ASSERT_EQ(out.dims(), make_ddim({n, h, w, c})); + ASSERT_EQ(out.dims(), common::make_ddim({n, h, w, c})); ASSERT_EQ(out.dims().size(), 4); ASSERT_EQ(out.meta().dtype, DataType::FLOAT32); ASSERT_EQ(out.meta().layout, DataLayout::NHWC); diff --git a/test/cpp/phi/ops/CMakeLists.txt b/test/cpp/phi/ops/CMakeLists.txt index 4e6cf31f75cdd1..978dad086c877f 100644 --- a/test/cpp/phi/ops/CMakeLists.txt +++ b/test/cpp/phi/ops/CMakeLists.txt @@ -1,4 +1,4 @@ cc_test( test_op_signature SRCS test_op_signature.cc - DEPS phi) + DEPS phi common) diff --git a/test/cpp/pir/cinn/CMakeLists.txt b/test/cpp/pir/cinn/CMakeLists.txt index a312a422254c00..7bcc9746e2f43e 100644 --- a/test/cpp/pir/cinn/CMakeLists.txt +++ b/test/cpp/pir/cinn/CMakeLists.txt @@ -39,8 +39,8 @@ if(WITH_TESTING AND WITH_CINN) DEPS drr pd_to_cinn_pass - op_dialect_vjp cinn_op_dialect + op_dialect_vjp pir_transforms pir) set_tests_properties(test_sub_graph_extract PROPERTIES LABELS "RUN_TYPE=CINN") @@ -51,7 +51,6 @@ if(WITH_TESTING AND WITH_CINN) ir_op_fusion_test.cc DEPS op_with_group_merge_pass - op_dialect_vjp cinn_op_dialect pir) set_tests_properties(ir_op_fusion_test PROPERTIES LABELS "RUN_TYPE=CINN") diff --git a/test/cpp/pir/cinn/group_op_test.cc b/test/cpp/pir/cinn/group_op_test.cc index 75379d69c733be..20897dcbb4e2d6 100644 --- a/test/cpp/pir/cinn/group_op_test.cc +++ b/test/cpp/pir/cinn/group_op_test.cc @@ -56,7 +56,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgram() { const float value_one = 1.0; const std::vector shape = {64, 128}; auto group_op1 = builder.Build( - CreateDenseTensorTypes(phi::make_ddim(shape))); + CreateDenseTensorTypes(common::make_ddim(shape))); pir::Block* block1 = group_op1.block(); builder.SetInsertionPointToEnd(block1); auto full_op_x = builder.Build( @@ -65,7 +65,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgram() { builder.SetInsertionPointToEnd(program->block()); auto group_op2 = builder.Build( - CreateDenseTensorTypes(phi::make_ddim(shape))); + CreateDenseTensorTypes(common::make_ddim(shape))); pir::Block* block2 = group_op2.block(); builder.SetInsertionPointToEnd(block2); @@ -168,7 +168,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgramForLowering() { shape, value, phi::DataType::FLOAT32, phi::GPUPlace()); auto group_op1 = builder.Build( - CreateDenseTensorTypes(phi::make_ddim(shape))); + CreateDenseTensorTypes(common::make_ddim(shape))); pir::Block* block1 = group_op1.block(); builder.SetInsertionPointToEnd(block1); auto sin = builder.Build(full_x->result(0)); @@ -179,7 +179,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgramForLowering() { builder.SetInsertionPointToEnd(program->block()); auto group_op2 = builder.Build( - CreateDenseTensorTypes(phi::make_ddim(shape))); + 
CreateDenseTensorTypes(common::make_ddim(shape))); pir::Block* block2 = group_op2.block(); builder.SetInsertionPointToEnd(block2); auto cos_op = builder.Build(full_y->result(0)); @@ -187,7 +187,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgramForLowering() { builder.SetInsertionPointToEnd(program->block()); auto group_op3 = builder.Build( - CreateDenseTensorTypes(phi::make_ddim(shape))); + CreateDenseTensorTypes(common::make_ddim(shape))); pir::Block* block3 = group_op3.block(); builder.SetInsertionPointToEnd(block3); auto add = builder.Build(group_op1->result(0), diff --git a/test/cpp/pir/core/CMakeLists.txt b/test/cpp/pir/core/CMakeLists.txt index 42c331c59fb70d..5a5981fccee931 100644 --- a/test/cpp/pir/core/CMakeLists.txt +++ b/test/cpp/pir/core/CMakeLists.txt @@ -1,7 +1,10 @@ -paddle_test(type_test SRCS type_test.cc DEPS pir op_dialect_vjp) +cc_test( + type_test + SRCS type_test.cc + DEPS pir op_dialect_vjp) cc_test_old(ir_attribute_test SRCS ir_attribute_test.cc DEPS pir gtest) cc_test_old(ir_value_test SRCS ir_value_test.cc DEPS pir gtest) -paddle_test( +cc_test_old( ir_op_test SRCS ir_op_test.cc @@ -19,6 +22,7 @@ cc_test_old( op_dialect_vjp pir phi + common gtest) cc_test_old( @@ -29,6 +33,7 @@ cc_test_old( op_dialect_vjp pir phi + common gtest) cc_test_old( diff --git a/test/cpp/pir/core/ir_op_test.cc b/test/cpp/pir/core/ir_op_test.cc index 9ae7b8b5c17953..bfd1e95dd98b7a 100644 --- a/test/cpp/pir/core/ir_op_test.cc +++ b/test/cpp/pir/core/ir_op_test.cc @@ -15,6 +15,7 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/pir/core/block.h" @@ -22,7 +23,6 @@ #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/dialect.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/ir_context.h" #include "paddle/pir/core/ir_printer.h" #include "paddle/pir/core/op_base.h" diff --git a/test/cpp/pir/core/ir_program_test.cc b/test/cpp/pir/core/ir_program_test.cc index 6e702b9f333b6f..045d9ed6815aa0 100644 --- a/test/cpp/pir/core/ir_program_test.cc +++ b/test/cpp/pir/core/ir_program_test.cc @@ -278,7 +278,7 @@ TEST(program_test, builder) { EXPECT_EQ( full_op_output.dyn_cast().offset() == 0, true); - for (auto dim : phi::vectorize( + for (auto dim : common::vectorize( full_op_output.dyn_cast() .dims())) { EXPECT_EQ(dim == 2, true); diff --git a/test/cpp/pir/core/type_interface_test.cc b/test/cpp/pir/core/type_interface_test.cc index e3bd38b8adf6b0..7a7af415823ee4 100644 --- a/test/cpp/pir/core/type_interface_test.cc +++ b/test/cpp/pir/core/type_interface_test.cc @@ -51,7 +51,7 @@ TEST(shapedtype_test, shapedtype_test) { EXPECT_EQ( dense_tensor_type_interface.GetElementType().isa(), true); - EXPECT_EQ(dense_tensor_type_interface.GetDyShape(), phi::vectorize(dims)); + EXPECT_EQ(dense_tensor_type_interface.GetDyShape(), common::vectorize(dims)); EXPECT_EQ(dense_tensor_type_interface.kDynamic, std::numeric_limits::min()); EXPECT_EQ(dense_tensor_type_interface.GetRank(), 2); diff --git a/test/cpp/pir/kernel_dialect/CMakeLists.txt b/test/cpp/pir/kernel_dialect/CMakeLists.txt index aea05a2bfeb199..938bf8c21339c3 100644 --- a/test/cpp/pir/kernel_dialect/CMakeLists.txt +++ b/test/cpp/pir/kernel_dialect/CMakeLists.txt @@ -1,4 +1,10 @@ cc_test( ir_kernel_dialect_pass_test SRCS ir_kernel_dialect_pass_test.cc - DEPS pir_transforms program_translator op_dialect pir phi gtest) + DEPS pir_transforms + program_translator 
+ op_dialect + pir + phi + common + gtest) diff --git a/test/cpp/pir/pass/CMakeLists.txt b/test/cpp/pir/pass/CMakeLists.txt index fb9f37e080f388..0cfd60a2a020f4 100644 --- a/test/cpp/pir/pass/CMakeLists.txt +++ b/test/cpp/pir/pass/CMakeLists.txt @@ -6,4 +6,5 @@ cc_test_old( pir op_dialect_vjp phi + common gtest) diff --git a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc index 401de5a7425805..9daec3a19bc807 100644 --- a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc +++ b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc @@ -32,13 +32,13 @@ #include "paddle/fluid/pir/transforms/fusion/conv2d_bn_fuse_pass.h" #include "paddle/fluid/pir/transforms/transform_general_functions.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/builder.h" #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_dialect.h" #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/cast_utils.h" #include "paddle/pir/core/dialect.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/ir_context.h" #include "paddle/pir/core/op_info.h" #include "paddle/pir/core/parameter.h" @@ -51,8 +51,8 @@ #include "paddle/pir/pattern_rewrite/pattern_match.h" #include "paddle/pir/pattern_rewrite/pattern_rewrite_driver.h" +#include "paddle/common/ddim.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/test/cpp/pir/shape_dialect/CMakeLists.txt b/test/cpp/pir/shape_dialect/CMakeLists.txt index ec80962c1cb3a2..3815531ded5db0 100644 --- a/test/cpp/pir/shape_dialect/CMakeLists.txt +++ b/test/cpp/pir/shape_dialect/CMakeLists.txt @@ -1,20 +1,12 @@ -paddle_test( +cc_test( shape_op_test - SRCS - shape_op_test.cc - DEPS - op_dialect_vjp - pir - gtest) + SRCS shape_op_test.cc + DEPS op_dialect_vjp pir gtest) -paddle_test( +cc_test( shape_struct_test - SRCS - shape_struct_test.cc - DEPS - op_dialect_vjp - pir - gtest) + SRCS shape_struct_test.cc + DEPS op_dialect_vjp pir gtest) paddle_test( constraint_pass_test diff --git a/test/cpp/pir/shape_dialect/constraint_pass_test.cc b/test/cpp/pir/shape_dialect/constraint_pass_test.cc index 3a78dc07faab4d..7ce7f405c76911 100644 --- a/test/cpp/pir/shape_dialect/constraint_pass_test.cc +++ b/test/cpp/pir/shape_dialect/constraint_pass_test.cc @@ -20,6 +20,7 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/pir/core/builder.h" @@ -29,7 +30,6 @@ #include "paddle/pir/core/builtin_type_interfaces.h" #include "paddle/pir/core/cast_utils.h" #include "paddle/pir/core/dialect.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/ir_context.h" #include "paddle/pir/core/op_info.h" #include "paddle/pir/core/parameter.h" diff --git a/test/cpp/pir/tools/test_interface.h b/test/cpp/pir/tools/test_interface.h index a2de7e1bb6972e..4f1eaca6ae7798 100644 --- a/test/cpp/pir/tools/test_interface.h +++ b/test/cpp/pir/tools/test_interface.h @@ -15,13 +15,13 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/pir/core/block.h" #include "paddle/pir/core/builder.h" #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/builtin_op.h" #include "paddle/pir/core/builtin_type.h" #include "paddle/pir/core/dialect.h" -#include "paddle/pir/core/enforce.h" #include "paddle/pir/core/ir_context.h" #include 
"paddle/pir/core/ir_printer.h" #include "paddle/pir/core/op_base.h" diff --git a/test/cpp/pir/tools/test_op.cc b/test/cpp/pir/tools/test_op.cc index d8ecbb3a2af385..cb2bf74293103d 100644 --- a/test/cpp/pir/tools/test_op.cc +++ b/test/cpp/pir/tools/test_op.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "test/cpp/pir/tools/test_op.h" +#include "paddle/common/enforce.h" #include "paddle/pir/core/builtin_attribute.h" -#include "paddle/pir/core/enforce.h" namespace test { diff --git a/test/cpp/pir/tools/test_trait.cc b/test/cpp/pir/tools/test_trait.cc index 1fa5dd0bba9118..431998b11c0cef 100644 --- a/test/cpp/pir/tools/test_trait.cc +++ b/test/cpp/pir/tools/test_trait.cc @@ -14,7 +14,7 @@ #include "test/cpp/pir/tools/test_trait.h" #include "glog/logging.h" -#include "paddle/pir/core/enforce.h" +#include "paddle/common/enforce.h" namespace test { void OneRegionTrait::Verify(pir::Operation *op) { diff --git a/test/cpp/prim/CMakeLists.txt b/test/cpp/prim/CMakeLists.txt index 3436ee702cce79..a844c99a6ff700 100644 --- a/test/cpp/prim/CMakeLists.txt +++ b/test/cpp/prim/CMakeLists.txt @@ -1,5 +1,6 @@ set(prim_eager_deps phi + common hook_utils utils global_utils @@ -15,7 +16,7 @@ set(prim_eager_deps set(prim_generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) -paddle_test(test_comp_static SRCS test_static_prim.cc) +paddle_test(test_comp_static SRCS test_static_prim.cc DEPS common) if(NOT (NOT WITH_PYTHON AND ON_INFER)) if(WITH_CINN) @@ -24,7 +25,8 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library(init_env_utils SRCS init_env_utils.cc) target_compile_definitions(init_env_utils PUBLIC PADDLE_DLL_EXPORT) - paddle_test(test_comp_eager SRCS test_eager_prim.cc DEPS init_env_utils) + paddle_test(test_comp_eager SRCS test_eager_prim.cc DEPS init_env_utils + common) endif() # skip win32 since wget is not installed by default on windows machine. diff --git a/test/cpp/prim/test_eager_prim.cc b/test/cpp/prim/test_eager_prim.cc index 3a5ba8aea829af..f451e229784c22 100644 --- a/test/cpp/prim/test_eager_prim.cc +++ b/test/cpp/prim/test_eager_prim.cc @@ -38,7 +38,7 @@ TEST(EagerPrim, TanhBackwardTest) { FLAGS_tensor_operants_mode = "eager"; paddle::prim::InitTensorOperants(); // 2. pre - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor0 = eager_test::CreateTensorWithValue(ddim, paddle::platform::CPUPlace(), @@ -95,7 +95,7 @@ TEST(EagerPrim, LogicalOperantsTest) { FLAGS_tensor_operants_mode = "eager"; paddle::prim::InitTensorOperants(); // 2. pre - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor0 = eager_test::CreateTensorWithValue(ddim, paddle::platform::CPUPlace(), @@ -133,7 +133,7 @@ TEST(EagerPrim, CompareOperantsTest) { FLAGS_tensor_operants_mode = "eager"; paddle::prim::InitTensorOperants(); // 2. pre - paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + paddle::framework::DDim ddim = common::make_ddim({4, 16, 16, 32}); paddle::Tensor tensor0 = eager_test::CreateTensorWithValue(ddim, paddle::platform::CPUPlace(),