diff --git a/paddle/fluid/distributed/collective/process_group.h b/paddle/fluid/distributed/collective/process_group.h
index 8767dfa60cf181..c8bb357739881b 100644
--- a/paddle/fluid/distributed/collective/process_group.h
+++ b/paddle/fluid/distributed/collective/process_group.h
@@ -20,12 +20,12 @@
 #include
 #include
+#include "paddle/common/errors.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/device_context.h"
 #include "paddle/phi/core/distributed/types.h"
 #include "paddle/phi/core/distributed/utils.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 constexpr auto kWaitTimeout = std::chrono::milliseconds(0);
diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.cc b/paddle/fluid/distributed/collective/process_group_bkcl.cc
index 81f52bc97f3342..f38b3e525eefcb 100644
--- a/paddle/fluid/distributed/collective/process_group_bkcl.cc
+++ b/paddle/fluid/distributed/collective/process_group_bkcl.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/distributed/collective/process_group_bkcl.h"
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/distributed/collective/bkcl_tools.h"
 #include "paddle/fluid/distributed/collective/common.h"
 #include "paddle/fluid/framework/convert_utils.h"
@@ -23,7 +24,6 @@
 #include "paddle/phi/core/device_context.h"
 #include "paddle/phi/core/distributed/check/static_check.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/collective/process_group_with_stream.h b/paddle/fluid/distributed/collective/process_group_with_stream.h
index 0cea9bb3ed87e6..58d1a042fec3c8 100644
--- a/paddle/fluid/distributed/collective/process_group_with_stream.h
+++ b/paddle/fluid/distributed/collective/process_group_with_stream.h
@@ -14,9 +14,9 @@
 #pragma once
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/distributed/collective/process_group.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/collective/process_group_without_stream.h b/paddle/fluid/distributed/collective/process_group_without_stream.h
index dd22c0f1e4cbdb..a3c103574cbc5a 100644
--- a/paddle/fluid/distributed/collective/process_group_without_stream.h
+++ b/paddle/fluid/distributed/collective/process_group_without_stream.h
@@ -14,9 +14,9 @@
 #pragma once
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/distributed/collective/process_group.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
index 7817b9bc0e9dfe..4190019e0d1738 100644
--- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
@@ -14,12 +14,12 @@
 #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/jit/serializer.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc
index 2e3389af5feb59..704dd16400065c 100644
--- a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc
@@ -14,13 +14,13 @@
 #include "paddle/fluid/distributed/fleet_executor/cond_interceptor.h"
 #include
+#include "paddle/common/errors.h"
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/fleet_executor/start_interceptor.cc b/paddle/fluid/distributed/fleet_executor/start_interceptor.cc
index 830f619ed3c00c..1fe4aaea15fc4d 100644
--- a/paddle/fluid/distributed/fleet_executor/start_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/start_interceptor.cc
@@ -14,9 +14,9 @@
 #include "paddle/fluid/distributed/fleet_executor/start_interceptor.h"
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
index d29ef0f9ad1fad..61080c52c94bac 100644
--- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h"
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/phi/common/bfloat16.h"
@@ -21,7 +22,6 @@
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 #include "paddle/phi/backends/device_manager.h"
 #endif
diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc
index 08aafa4a60a0e7..a1f74d3423006b 100644
--- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc
@@ -16,10 +16,10 @@
 #include
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/utils/string/pretty_log.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc
index 697a34904c817e..f9e8722ccf3978 100644
--- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc
@@ -19,11 +19,11 @@
 #include
 #include
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/utils/string/pretty_log.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
index 5a086acd7cac2e..a59e1be1595036 100644
--- a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
+++ b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc
@@ -18,13 +18,13 @@
 #include
 #include
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/new_executor/interpreter/job.h b/paddle/fluid/framework/new_executor/interpreter/job.h
index 952702d6e2f0a5..1ff08d062d23c6 100644
--- a/paddle/fluid/framework/new_executor/interpreter/job.h
+++ b/paddle/fluid/framework/new_executor/interpreter/job.h
@@ -16,8 +16,8 @@
 #include
 #include
 
+#include "paddle/common/errors.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/macros.h"
 
 namespace paddle {
diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc
index 18baaf98fdf11c..7903c212ec90aa 100644
--- a/paddle/fluid/imperative/layout_autotune.cc
+++ b/paddle/fluid/imperative/layout_autotune.cc
@@ -14,12 +14,12 @@
 #include "paddle/fluid/imperative/layout_autotune.h"
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/eager/api/utils/global_utils.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/imperative/layout_transformer.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace imperative {
diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h
index 61bd4f9dfe2b8f..a18207df0260e0 100644
--- a/paddle/fluid/imperative/layout_transformer.h
+++ b/paddle/fluid/imperative/layout_transformer.h
@@ -13,12 +13,12 @@
 // limitations under the License.
 #pragma once
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/imperative/layout_autotune.h"
 #include "paddle/fluid/imperative/tracer.h"
 #include "paddle/fluid/imperative/var_helper.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/tensor_utils.h"
 
 namespace paddle {
 namespace imperative {
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index d3e4ce93ca01e5..5e705b4fb9877a 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -22,13 +22,13 @@
 #include
 #include
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/argument.h"
 #include "paddle/fluid/string/pretty_log.h"
 #include "paddle/phi/common/data_type.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc
index 2414aaee1b78b5..2806204f4b9406 100644
--- a/paddle/fluid/inference/api/resource_manager.cc
+++ b/paddle/fluid/inference/api/resource_manager.cc
@@ -20,6 +20,7 @@
 #include
 #include
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
 #include "paddle/phi/backends/gpu/forwards.h"
@@ -28,7 +29,6 @@
 #include "paddle/phi/backends/gpu/gpu_resources.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/allocator.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/generator.h"
 #include "unsupported/Eigen/CXX11/Tensor"
diff --git a/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc
index 9f14c8c1b64fb8..e811827a7296c1 100644
--- a/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc
+++ b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "paddle/common/errors.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/generic_plugin.h"
 #include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h"
 #include "paddle/phi/api/ext/op_meta_info.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace inference {
diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc
index d6986b51306ebd..0b2e20f77837a2 100644
--- a/paddle/fluid/jit/layer.cc
+++ b/paddle/fluid/jit/layer.cc
@@ -14,9 +14,9 @@
 #include "paddle/fluid/jit/layer.h"
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 #include "paddle/fluid/jit/compilation_unit.h"
 #include "paddle/fluid/jit/engine/base_engine.h"
diff --git a/paddle/fluid/jit/property.cc b/paddle/fluid/jit/property.cc
index 9b0c50a954624c..687468df83a3dc 100644
--- a/paddle/fluid/jit/property.cc
+++ b/paddle/fluid/jit/property.cc
@@ -18,10 +18,10 @@ limitations under the License. */
 
 #include "glog/logging.h"
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/jit/property.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 namespace paddle {
 namespace jit {
diff --git a/paddle/fluid/operators/fused/fused_attention_utils.h b/paddle/fluid/operators/fused/fused_attention_utils.h
index 7d17041133bcd7..b198c4a5792912 100644
--- a/paddle/fluid/operators/fused/fused_attention_utils.h
+++ b/paddle/fluid/operators/fused/fused_attention_utils.h
@@ -23,8 +23,8 @@
 PHI_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 
+#include "paddle/common/errors.h"
 #include "paddle/phi/core/distributed/comm_context_manager.h"
-#include "paddle/phi/core/errors.h"
 
 namespace phi {
 namespace fusion {
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index ee40633e4252b3..656f8ba6ad0acb 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/fused/fused_attention_utils.h"
 #include "paddle/phi/api/include/tensor.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 0d170eae31cfb1..b6fce494f5a740 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -23,6 +23,7 @@
 #include
 #include
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/framework/data_device_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -39,7 +40,6 @@
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/kernels/cast_kernel.h"
 #include "paddle/phi/kernels/funcs/data_type_transform.h"
 #include "paddle/utils/string/string_helper.h"
diff --git a/paddle/fluid/pir/transforms/transform_general_functions.h b/paddle/fluid/pir/transforms/transform_general_functions.h
index 77c790235b8329..ab279f0ab3a958 100644
--- a/paddle/fluid/pir/transforms/transform_general_functions.h
+++ b/paddle/fluid/pir/transforms/transform_general_functions.h
@@ -14,9 +14,9 @@
 #pragma once
 
+#include "paddle/common/errors.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/pir/core/operation.h"
 #include "paddle/pir/core/parameter.h"
 #include "paddle/pir/core/type.h"
diff --git a/paddle/fluid/platform/errors.h b/paddle/fluid/platform/errors.h
index 758af3e2d9137e..e3c307820f84bb 100644
--- a/paddle/fluid/platform/errors.h
+++ b/paddle/fluid/platform/errors.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/phi/core/errors.h"
+#include "paddle/common/errors.h"
 namespace paddle {
 namespace platform {
 namespace errors = ::phi::errors;
diff --git a/paddle/fluid/pybind/eval_frame_tools.cc b/paddle/fluid/pybind/eval_frame_tools.cc
index 3b8df99eb2a3f3..da09c2478c02cd 100644
--- a/paddle/fluid/pybind/eval_frame_tools.cc
+++ b/paddle/fluid/pybind/eval_frame_tools.cc
@@ -18,9 +18,9 @@
 #include
 
+#include "paddle/common/errors.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 /*============================ Dict Tree ================================*/
diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc
index ee1e21a58e2f1b..1caa57770f54e2 100644
--- a/paddle/phi/api/lib/context_pool.cc
+++ b/paddle/phi/api/lib/context_pool.cc
@@ -14,10 +14,10 @@ limitations under the License.
 */
 #include "paddle/phi/api/include/context_pool.h"
 
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/allocator.h"
-#include "paddle/phi/core/enforce.h"
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/phi/core/cuda_stream.h"
diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc
index 14334aa7c42a6d..20cb6a142e2f03 100644
--- a/paddle/phi/api/lib/op_meta_info.cc
+++ b/paddle/phi/api/lib/op_meta_info.cc
@@ -20,9 +20,9 @@ limitations under the License. */
 #include
 
 #include "glog/logging.h"
+#include "paddle/common/enforce.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
-#include "paddle/phi/core/enforce.h"
 
 namespace paddle {
diff --git a/paddle/phi/api/lib/scalar.cc b/paddle/phi/api/lib/scalar.cc
index 75232adb9be45d..a217c4da021f0a 100644
--- a/paddle/phi/api/lib/scalar.cc
+++ b/paddle/phi/api/lib/scalar.cc
@@ -14,10 +14,10 @@ limitations under the License. */
 #include "paddle/common/scalar.h"
 
+#include "paddle/common/enforce.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/api/lib/tensor_copy.h"
 #include "paddle/phi/common/place.h"
-#include "paddle/phi/core/enforce.h"
 
 namespace paddle {
 namespace experimental {
diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc
index 206d5082e62dd1..edfc76ffe21ab6 100644
--- a/paddle/phi/api/lib/tensor.cc
+++ b/paddle/phi/api/lib/tensor.cc
@@ -20,14 +20,14 @@ limitations under the License. */
 
 #include "glog/logging.h"
+#include "paddle/common/ddim.h"
+#include "paddle/common/enforce.h"
 #include "paddle/phi/api/include/context_pool.h"
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
-#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
-#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/selected_rows.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"
diff --git a/paddle/phi/api/profiler/common_event.h b/paddle/phi/api/profiler/common_event.h
index 76b9d5fa609b9b..d9e3ed74fd397a 100644
--- a/paddle/phi/api/profiler/common_event.h
+++ b/paddle/phi/api/profiler/common_event.h
@@ -18,10 +18,10 @@
 #include
 #include
 
+#include "paddle/common/ddim.h"
 #include "paddle/phi/api/profiler/event.h"  // import EventRole, TODO(TIEXING): remove later
 #include "paddle/phi/api/profiler/trace_event.h"
 #include "paddle/phi/core/attribute.h"
-#include "paddle/phi/core/ddim.h"
 
 namespace phi {
diff --git a/paddle/phi/api/profiler/device_tracer.cc b/paddle/phi/api/profiler/device_tracer.cc
index e294130da7bab8..8f5c1c79cb1a1a 100644
--- a/paddle/phi/api/profiler/device_tracer.cc
+++ b/paddle/phi/api/profiler/device_tracer.cc
@@ -22,7 +22,7 @@ limitations under the License. */
 #include  // NOLINT
 
 #include "glog/logging.h"
-#include "paddle/phi/core/enforce.h"
+#include "paddle/common/enforce.h"
 #include "paddle/utils/flags.h"
 
 PD_DECLARE_bool(enable_host_event_recorder_hook);
diff --git a/paddle/phi/api/profiler/profiler.cc b/paddle/phi/api/profiler/profiler.cc
index 6dc419658d3c27..7a0d819a257267 100644
--- a/paddle/phi/api/profiler/profiler.cc
+++ b/paddle/phi/api/profiler/profiler.cc
@@ -22,12 +22,12 @@ limitations under the License.
 */
 #include "glog/logging.h"
+#include "paddle/common/enforce.h"
 #include "paddle/phi/api/profiler/common_event.h"
 #include "paddle/phi/api/profiler/device_tracer.h"
 #include "paddle/phi/api/profiler/host_event_recorder.h"
 #include "paddle/phi/api/profiler/host_tracer.h"
 #include "paddle/phi/api/profiler/profiler_helper.h"
-#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/os_info.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/phi/backends/dynload/nvtx.h"
diff --git a/paddle/phi/api/profiler/supplement_tracing.h b/paddle/phi/api/profiler/supplement_tracing.h
index e93ad63b607ade..fc20f041ec02a7 100644
--- a/paddle/phi/api/profiler/supplement_tracing.h
+++ b/paddle/phi/api/profiler/supplement_tracing.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include
 #include
 
+#include "paddle/common/ddim.h"
 #include "paddle/phi/core/attribute.h"
-#include "paddle/phi/core/ddim.h"
 
 namespace phi {
diff --git a/paddle/phi/api/yaml/generator/tensor_operants_gen.py b/paddle/phi/api/yaml/generator/tensor_operants_gen.py
index 444cfb1ddd1c81..845cf8afae2199 100644
--- a/paddle/phi/api/yaml/generator/tensor_operants_gen.py
+++ b/paddle/phi/api/yaml/generator/tensor_operants_gen.py
@@ -443,8 +443,8 @@ class TEST_API OperantsManager {
 #include "paddle/phi/api/include/operants_manager.h"
 
 #include "glog/logging.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
+#include "paddle/common/enforce.h"
+#include "paddle/common/errors.h"
 #include "paddle/utils/flags.h"
 """
diff --git a/paddle/phi/backends/c_comm_lib.h b/paddle/phi/backends/c_comm_lib.h
index cc86d0cae00915..0b6cee136d2a58 100644
--- a/paddle/phi/backends/c_comm_lib.h
+++ b/paddle/phi/backends/c_comm_lib.h
@@ -15,11 +15,11 @@
 #pragma once
 #include
 
+#include "paddle/common/enforce.h"
+#include "paddle/common/errors.h"
 #include "paddle/common/macros.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/place.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 #include "paddle/phi/common/reduce_type.h"
diff --git a/paddle/phi/backends/callback_manager.cc b/paddle/phi/backends/callback_manager.cc
index 0d658258fa4c05..4b9ccd7c798e23 100644
--- a/paddle/phi/backends/callback_manager.cc
+++ b/paddle/phi/backends/callback_manager.cc
@@ -16,8 +16,8 @@
 #include
 
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/device_guard.h"
-#include "paddle/phi/core/enforce.h"
 
 namespace phi {
diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc
index 7824fc3b160b10..4f7fefa3dfa027 100644
--- a/paddle/phi/backends/context_pool.cc
+++ b/paddle/phi/backends/context_pool.cc
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include "glog/logging.h"
+#include "paddle/common/enforce.h"
 #include "paddle/phi/common/memory_utils.h"
-#include "paddle/phi/core/enforce.h"
 
 namespace phi {
diff --git a/paddle/phi/backends/cpu/cpu_context.cc b/paddle/phi/backends/cpu/cpu_context.cc
index 1a3ae7ae351d6b..ca7f93f3aea852 100644
--- a/paddle/phi/backends/cpu/cpu_context.cc
+++ b/paddle/phi/backends/cpu/cpu_context.cc
@@ -14,8 +14,8 @@
 #include "paddle/phi/backends/cpu/cpu_context.h"
 
+#include "paddle/common/enforce.h"
 #include "paddle/phi/common/place.h"
-#include "paddle/phi/core/enforce.h"
 
 // NOTE: The paddle framework should add WITH_EIGEN option to support compile
 // without eigen.
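
Note: the paddle/fluid/platform/errors.h hunk above is what makes this whole header
migration non-breaking: the old header now forwards to paddle/common/errors.h, and a
namespace alias keeps the old paddle::platform::errors::* spellings compiling. A minimal
sketch of that forwarding pattern (illustrative only, restating that hunk; not an
additional change in this patch):

    // Sketch of the forwarding-header pattern used by this patch, mirroring
    // the post-patch content of paddle/fluid/platform/errors.h.
    #pragma once
    #include "paddle/common/errors.h"  // new canonical home of the errors API
    namespace paddle {
    namespace platform {
    // Existing call sites such as platform::errors::InvalidArgument(...)
    // keep compiling against the relocated implementation.
    namespace errors = ::phi::errors;
    }  // namespace platform
    }  // namespace paddle
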
diff --git a/paddle/phi/backends/custom/enforce_custom.h b/paddle/phi/backends/custom/enforce_custom.h
index c98d4580d3cdb8..96a653e266e4fc 100644
--- a/paddle/phi/backends/custom/enforce_custom.h
+++ b/paddle/phi/backends/custom/enforce_custom.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 #include
 
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/device_ext.h"
-#include "paddle/phi/core/enforce.h"
 namespace phi {
 template
diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc
index 5b6b8fcfc2fe9a..d014cf59695328 100644
--- a/paddle/phi/backends/device_base.cc
+++ b/paddle/phi/backends/device_base.cc
@@ -15,7 +15,7 @@
 #include "paddle/phi/backends/device_base.h"
 
 #include "glog/logging.h"
-#include "paddle/phi/core/enforce.h"
+#include "paddle/common/enforce.h"
 #include "paddle/utils/flags.h"
 
 PD_DECLARE_double(fraction_of_gpu_memory_to_use);
diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h
index 8debb4dc9c45ee..1ecdf5cb3bac41 100644
--- a/paddle/phi/backends/device_code.h
+++ b/paddle/phi/backends/device_code.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include
 #include
 
+#include "paddle/common/enforce.h"
 #include "paddle/phi/common/place.h"
-#include "paddle/phi/core/enforce.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/phi/backends/dynload/cuda_driver.h"
 #include "paddle/phi/backends/dynload/nvrtc.h"
diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h
index 8508d5206558d2..2276f76dc8e4d0 100644
--- a/paddle/phi/backends/device_memory_aligment.h
+++ b/paddle/phi/backends/device_memory_aligment.h
@@ -15,10 +15,10 @@ limitations under the License. */
 #pragma once
 #include
 
+#include "paddle/common/enforce.h"
+#include "paddle/common/errors.h"
 #include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/common/place.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/xpu/xpu_info.h"
diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h
index 95800e1f64aacd..6604374a22db7a 100644
--- a/paddle/phi/backends/dynload/cublasLt.h
+++ b/paddle/phi/backends/dynload/cublasLt.h
@@ -44,7 +44,7 @@ extern void *cublasLt_dso_handle;
       using cublasLt_func = \
           decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
       std::call_once(cublasLt_dso_flag, []() { \
-        cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle(); \
+        cublasLt_dso_handle = common::dynload::GetCublasLtDsoHandle(); \
       }); \
       static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \
       return reinterpret_cast<cublasLt_func>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/dynload/cuda_driver.cc b/paddle/phi/backends/dynload/cuda_driver.cc
index d9fd89a0c65a6f..27000ccfe332e9 100644
--- a/paddle/phi/backends/dynload/cuda_driver.cc
+++ b/paddle/phi/backends/dynload/cuda_driver.cc
@@ -14,7 +14,7 @@ limitations under the License.
 */
 #include "paddle/phi/backends/dynload/cuda_driver.h"
 
-namespace phi {
+namespace common {
 namespace dynload {
 
 std::once_flag cuda_dso_flag;
@@ -34,4 +34,4 @@ bool HasCUDADriver() {
 }
 
 }  // namespace dynload
-}  // namespace phi
+}  // namespace common
diff --git a/paddle/phi/backends/dynload/cuda_driver.h b/paddle/phi/backends/dynload/cuda_driver.h
index 1a5f243c31257a..a949bcdb2b0f40 100644
--- a/paddle/phi/backends/dynload/cuda_driver.h
+++ b/paddle/phi/backends/dynload/cuda_driver.h
@@ -34,7 +34,7 @@ extern bool HasCUDADriver();
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using cuda_func = decltype(&::__name); \
       std::call_once(cuda_dso_flag, []() { \
-        cuda_dso_handle = phi::dynload::GetCUDADsoHandle(); \
+        cuda_dso_handle = common::dynload::GetCUDADsoHandle(); \
       }); \
       static void* p_##__name = dlsym(cuda_dso_handle, #__name); \
       return reinterpret_cast<cuda_func>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc
index dbef1c002e8d93..9dcd8c69499703 100644
--- a/paddle/phi/backends/dynload/cudnn.cc
+++ b/paddle/phi/backends/dynload/cudnn.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/common/backends/dynload/cudnn.h"
 
-#include "paddle/phi/core/enforce.h"
+#include "paddle/common/enforce.h"
 
 namespace phi {
 namespace dynload {
diff --git a/paddle/phi/backends/dynload/cudnn_frontend.h b/paddle/phi/backends/dynload/cudnn_frontend.h
index 0ea9d7cf7adf9f..7bfb51c73a5a95 100644
--- a/paddle/phi/backends/dynload/cudnn_frontend.h
+++ b/paddle/phi/backends/dynload/cudnn_frontend.h
@@ -23,8 +23,8 @@ limitations under the License. */
 PD_DECLARE_bool(enable_cudnn_frontend);
 
 // Redirect the CUDNN APIs in the cudnn_frontend namespace to
-// the functions in phi::dynload
-#define CUDNN_FRONTEND_OVERRIDE_SYMBOL(__name) using phi::dynload::__name
+// the functions in common::dynload
+#define CUDNN_FRONTEND_OVERRIDE_SYMBOL(__name) using common::dynload::__name
 
 #define CUDNN_FRONTEND_APPLY_EACH(__macro) \
   __macro(cudnnBackendCreateDescriptor); \
diff --git a/paddle/phi/backends/dynload/cufft.cc b/paddle/phi/backends/dynload/cufft.cc
index a15969ecc3f87c..d9ac967208ba40 100644
--- a/paddle/phi/backends/dynload/cufft.cc
+++ b/paddle/phi/backends/dynload/cufft.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/phi/backends/dynload/cufft.h"
 
-#include "paddle/phi/core/enforce.h"
+#include "paddle/common/enforce.h"
 
 namespace phi {
 namespace dynload {
diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h
index 30c9ec6e8f7dab..84cce45235c985 100644
--- a/paddle/phi/backends/dynload/cufft.h
+++ b/paddle/phi/backends/dynload/cufft.h
@@ -37,7 +37,7 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using cufft_func = decltype(&::__name); \
       std::call_once(cufft_dso_flag, []() { \
-        cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \
+        cufft_dso_handle = common::dynload::GetCUFFTDsoHandle(); \
      }); \
       EnforceCUFFTLoaded(#__name); \
       static void* p_##__name = dlsym(cufft_dso_handle, #__name); \
diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h
index 8e02009b547039..6195255df9d741 100644
--- a/paddle/phi/backends/dynload/cupti.h
+++ b/paddle/phi/backends/dynload/cupti.h
@@ -43,7 +43,7 @@ extern void *cupti_dso_handle;
     inline CUptiResult CUPTIAPI operator()(Args... args) { \
       using cuptiFunc = decltype(&::__name); \
       std::call_once(cupti_dso_flag, []() { \
-        cupti_dso_handle = phi::dynload::GetCUPTIDsoHandle(); \
+        cupti_dso_handle = common::dynload::GetCUPTIDsoHandle(); \
       }); \
       static void *p_##__name = dlsym(cupti_dso_handle, #__name); \
       return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/dynload/curand.cc b/paddle/phi/backends/dynload/curand.cc
index 6666b7f23962d8..36ca5b696b6c1e 100644
--- a/paddle/phi/backends/dynload/curand.cc
+++ b/paddle/phi/backends/dynload/curand.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/common/backends/dynload/curand.h"
 
-namespace phi {
+namespace common {
 namespace dynload {
 
 std::once_flag curand_dso_flag;
@@ -25,4 +25,4 @@ void *curand_dso_handle;
 CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
 
 }  // namespace dynload
-}  // namespace phi
+}  // namespace common
diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h
index ec8c80d6749b4b..f5484ee43a1a8a 100644
--- a/paddle/phi/backends/dynload/cusparse.h
+++ b/paddle/phi/backends/dynload/cusparse.h
@@ -26,18 +26,18 @@ namespace dynload {
 extern std::once_flag cusparse_dso_flag;
 extern void *cusparse_dso_handle;
 
-#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
-  struct DynLoad__##__name { \
-    template <typename... Args> \
-    cusparseStatus_t operator()(Args... args) { \
-      using Func = decltype(&::__name); \
-      std::call_once(cusparse_dso_flag, []() { \
-        cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \
-      }); \
-      static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
-      return reinterpret_cast<Func>(p_##__name)(args...); \
-    } \
-  }; \
+#define DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(__name) \
+  struct DynLoad__##__name { \
+    template <typename... Args> \
+    cusparseStatus_t operator()(Args... args) { \
+      using Func = decltype(&::__name); \
+      std::call_once(cusparse_dso_flag, []() { \
+        cusparse_dso_handle = common::dynload::GetCusparseDsoHandle(); \
+      }); \
+      static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
+      return reinterpret_cast<Func>(p_##__name)(args...); \
+    } \
+  }; \
   extern DynLoad__##__name __name
 
 #if defined(PADDLE_WITH_CUDA)
diff --git a/paddle/phi/backends/dynload/cusparseLt.h b/paddle/phi/backends/dynload/cusparseLt.h
index bdaae044ee1d96..f293e9e82be8b3 100644
--- a/paddle/phi/backends/dynload/cusparseLt.h
+++ b/paddle/phi/backends/dynload/cusparseLt.h
@@ -34,18 +34,18 @@ extern void *cusparselt_dso_handle;
 *
 * note: default dynamic linked libs
 */
-#define DECLARE_DYNAMIC_LOAD_CUSPARSELT_WRAP(__name) \
-  struct DynLoad__##__name { \
-    template <typename... Args> \
-    cusparseStatus_t operator()(Args... args) { \
-      using cusparseltFunc = decltype(&::__name); \
-      std::call_once(cusparselt_dso_flag, []() { \
-        cusparselt_dso_handle = phi::dynload::GetCusparseLtDsoHandle(); \
-      }); \
-      static void *p_##__name = dlsym(cusparselt_dso_handle, #__name); \
-      return reinterpret_cast<cusparseltFunc>(p_##__name)(args...); \
-    } \
-  }; \
+#define DECLARE_DYNAMIC_LOAD_CUSPARSELT_WRAP(__name) \
+  struct DynLoad__##__name { \
+    template <typename... Args> \
+    cusparseStatus_t operator()(Args... args) { \
+      using cusparseltFunc = decltype(&::__name); \
+      std::call_once(cusparselt_dso_flag, []() { \
+        cusparselt_dso_handle = common::dynload::GetCusparseLtDsoHandle(); \
+      }); \
+      static void *p_##__name = dlsym(cusparselt_dso_handle, #__name); \
+      return reinterpret_cast<cusparseltFunc>(p_##__name)(args...); \
+    } \
+  }; \
   extern DynLoad__##__name __name
 #if defined(PADDLE_WITH_CUDA)
 #if CUDA_VERSION >= 11020
diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc
index 3e38732fb0c066..e0d7da9ee24cce 100644
--- a/paddle/phi/backends/dynload/dynamic_loader.cc
+++ b/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -18,8 +18,8 @@ limitations under the License. */
 #include
 
 #include "paddle/common/backends/dynload/port.h"
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/dynload/cupti_lib_path.h"
-#include "paddle/phi/core/enforce.h"
 
 #if defined(_WIN32)
 #include
diff --git a/paddle/phi/backends/dynload/flashattn.h b/paddle/phi/backends/dynload/flashattn.h
index 799d31346e0606..e229f76f62843d 100644
--- a/paddle/phi/backends/dynload/flashattn.h
+++ b/paddle/phi/backends/dynload/flashattn.h
@@ -26,18 +26,18 @@ namespace dynload {
 extern std::once_flag flashattn_dso_flag;
 extern void* flashattn_dso_handle;
 
-#define DYNAMIC_LOAD_FLASHATTN_WRAP(__name) \
-  struct DynLoad__##__name { \
-    template <typename... Args> \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
-      using flashattnFunc = decltype(&::__name); \
-      std::call_once(flashattn_dso_flag, []() { \
-        flashattn_dso_handle = phi::dynload::GetFlashAttnDsoHandle(); \
-      }); \
-      static void* p_##__name = dlsym(flashattn_dso_handle, #__name); \
-      return reinterpret_cast<flashattnFunc>(p_##__name)(args...); \
-    } \
-  }; \
+#define DYNAMIC_LOAD_FLASHATTN_WRAP(__name) \
+  struct DynLoad__##__name { \
+    template <typename... Args> \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using flashattnFunc = decltype(&::__name); \
+      std::call_once(flashattn_dso_flag, []() { \
+        flashattn_dso_handle = common::dynload::GetFlashAttnDsoHandle(); \
+      }); \
+      static void* p_##__name = dlsym(flashattn_dso_handle, #__name); \
+      return reinterpret_cast<flashattnFunc>(p_##__name)(args...); \
+    } \
+  }; \
   extern DynLoad__##__name __name
 
 #define DECLARE_DYNAMIC_LOAD_FLASHATTN_WRAP(__name) \
diff --git a/paddle/phi/backends/dynload/hiprtc.h b/paddle/phi/backends/dynload/hiprtc.h
index e4cf485dd22db1..66cb5d3ebec203 100644
--- a/paddle/phi/backends/dynload/hiprtc.h
+++ b/paddle/phi/backends/dynload/hiprtc.h
@@ -34,7 +34,7 @@ extern bool HasNVRTC();
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using hiprtc_func = decltype(&::__name); \
       std::call_once(hiprtc_dso_flag, []() { \
-        hiprtc_dso_handle = phi::dynload::GetNVRTCDsoHandle(); \
+        hiprtc_dso_handle = common::dynload::GetNVRTCDsoHandle(); \
       }); \
       static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \
       return reinterpret_cast<hiprtc_func>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/dynload/lapack.h b/paddle/phi/backends/dynload/lapack.h
index b7ff843a1e273e..f010aa01f2e328 100644
--- a/paddle/phi/backends/dynload/lapack.h
+++ b/paddle/phi/backends/dynload/lapack.h
@@ -325,7 +325,7 @@ extern void *lapack_dso_handle;
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using lapackFunc = decltype(&::__name); \
       std::call_once(lapack_dso_flag, []() { \
-        lapack_dso_handle = phi::dynload::GetLAPACKDsoHandle(); \
+        lapack_dso_handle = common::dynload::GetLAPACKDsoHandle(); \
       }); \
       static void *p_##_name = dlsym(lapack_dso_handle, #__name); \
       return reinterpret_cast<lapackFunc>(p_##_name)(args...); \
diff --git a/paddle/phi/backends/dynload/miopen.cc b/paddle/phi/backends/dynload/miopen.cc
index b8f328b4aae34e..248d899e2477e3 100644
--- a/paddle/phi/backends/dynload/miopen.cc
+++ b/paddle/phi/backends/dynload/miopen.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/phi/backends/dynload/miopen.h"
 
-#include "paddle/phi/core/enforce.h"
+#include "paddle/common/enforce.h"
 
 namespace phi {
 namespace dynload {
diff --git a/paddle/phi/backends/dynload/mklrt.h b/paddle/phi/backends/dynload/mklrt.h
index 53d704b2b5c3ba..564cd95450a663 100644
--- a/paddle/phi/backends/dynload/mklrt.h
+++ b/paddle/phi/backends/dynload/mklrt.h
@@ -38,7 +38,7 @@ extern void* mklrt_dso_handle;
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using mklrtFunc = decltype(&::__name); \
       std::call_once(mklrt_dso_flag, []() { \
-        mklrt_dso_handle = phi::dynload::GetMKLRTDsoHandle(); \
+        mklrt_dso_handle = common::dynload::GetMKLRTDsoHandle(); \
       }); \
       static void* p_##__name = dlsym(mklrt_dso_handle, #__name); \
       return reinterpret_cast<mklrtFunc>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/dynload/nvjpeg.h b/paddle/phi/backends/dynload/nvjpeg.h
index b2257dd7e5d159..1018e04d5ca0d8 100644
--- a/paddle/phi/backends/dynload/nvjpeg.h
+++ b/paddle/phi/backends/dynload/nvjpeg.h
@@ -29,7 +29,7 @@ extern void *nvjpeg_dso_handle;
     nvjpegStatus_t operator()(Args... args) { \
       using nvjpegFunc = decltype(&::__name); \
       std::call_once(nvjpeg_dso_flag, []() { \
-        nvjpeg_dso_handle = phi::dynload::GetNvjpegDsoHandle(); \
+        nvjpeg_dso_handle = common::dynload::GetNvjpegDsoHandle(); \
       }); \
       static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \
       return reinterpret_cast<nvjpegFunc>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/dynload/nvrtc.cc b/paddle/phi/backends/dynload/nvrtc.cc
index 0ed370801c6acd..cb12021f6b81fb 100644
--- a/paddle/phi/backends/dynload/nvrtc.cc
+++ b/paddle/phi/backends/dynload/nvrtc.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/phi/backends/dynload/nvrtc.h"
 
-namespace phi {
+namespace common {
 namespace dynload {
 
 std::once_flag nvrtc_dso_flag;
@@ -31,4 +31,4 @@ bool HasNVRTC() {
 }
 
 }  // namespace dynload
-}  // namespace phi
+}  // namespace common
diff --git a/paddle/phi/backends/dynload/nvrtc.h b/paddle/phi/backends/dynload/nvrtc.h
index ce5be605cdf50f..5275032464b774 100644
--- a/paddle/phi/backends/dynload/nvrtc.h
+++ b/paddle/phi/backends/dynload/nvrtc.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/common/backends/dynload/dynamic_loader.h"
 #include "paddle/common/backends/dynload/port.h"
 
-namespace phi {
+namespace common {
 namespace dynload {
 
 extern std::once_flag nvrtc_dso_flag;
@@ -34,7 +34,7 @@ extern bool HasNVRTC();
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using nvrtc_func = decltype(&::__name); \
       std::call_once(nvrtc_dso_flag, []() { \
-        nvrtc_dso_handle = phi::dynload::GetNVRTCDsoHandle(); \
+        nvrtc_dso_handle = common::dynload::GetNVRTCDsoHandle(); \
       }); \
       static void* p_##__name = dlsym(nvrtc_dso_handle, #__name); \
       return reinterpret_cast<nvrtc_func>(p_##__name)(args...); \
@@ -61,4 +61,4 @@ NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP);
 #undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP
 
 }  // namespace dynload
-}  // namespace phi
+}  // namespace common
diff --git a/paddle/phi/backends/dynload/nvtx.h b/paddle/phi/backends/dynload/nvtx.h
index c2817764d036a2..ef7ee636d8b935 100644
--- a/paddle/phi/backends/dynload/nvtx.h
+++ b/paddle/phi/backends/dynload/nvtx.h
@@ -32,7 +32,7 @@ extern void *nvtx_dso_handle;
     int operator()(Args... args) { \
       using nvtxFunc = decltype(&::__name); \
       std::call_once(nvtx_dso_flag, []() { \
-        nvtx_dso_handle = phi::dynload::GetNvtxDsoHandle(); \
+        nvtx_dso_handle = common::dynload::GetNvtxDsoHandle(); \
       }); \
       static void *p_##__name = dlsym(nvtx_dso_handle, #__name); \
       return reinterpret_cast<nvtxFunc>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/dynload/rocblas.h b/paddle/phi/backends/dynload/rocblas.h
index da36aeeaf885e1..866a83d60486e3 100644
--- a/paddle/phi/backends/dynload/rocblas.h
+++ b/paddle/phi/backends/dynload/rocblas.h
@@ -42,7 +42,7 @@ extern void *rocblas_dso_handle;
     rocblas_status operator()(Args... args) { \
       using rocblas_func = decltype(&::__name); \
       std::call_once(rocblas_dso_flag, []() { \
-        rocblas_dso_handle = phi::dynload::GetCublasDsoHandle(); \
+        rocblas_dso_handle = common::dynload::GetCublasDsoHandle(); \
       }); \
       static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \
       return reinterpret_cast<rocblas_func>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/dynload/rocm_driver.h b/paddle/phi/backends/dynload/rocm_driver.h
index c1dd53caeea281..cff4023dfb3372 100644
--- a/paddle/phi/backends/dynload/rocm_driver.h
+++ b/paddle/phi/backends/dynload/rocm_driver.h
@@ -34,7 +34,7 @@ extern bool HasCUDADriver();
    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using rocm_func = decltype(&::__name); \
       std::call_once(rocm_dso_flag, []() { \
-        rocm_dso_handle = phi::dynload::GetCUDADsoHandle(); \
+        rocm_dso_handle = common::dynload::GetCUDADsoHandle(); \
       }); \
       static void* p_##__name = dlsym(rocm_dso_handle, #__name); \
       return reinterpret_cast<rocm_func>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/dynload/rocsparse.h b/paddle/phi/backends/dynload/rocsparse.h
index 0c24e03bff0258..f24d3d79c5f4f2 100644
--- a/paddle/phi/backends/dynload/rocsparse.h
+++ b/paddle/phi/backends/dynload/rocsparse.h
@@ -35,18 +35,18 @@ extern void *rocsparse_dso_handle;
 *
 * note: default dynamic linked libs
 */
-#define DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP(__name) \
-  struct DynLoad__##__name { \
-    template <typename... Args> \
-    rocsparse_status operator()(Args... args) { \
-      using rocsparse_func = decltype(&::__name); \
-      std::call_once(rocsparse_dso_flag, []() { \
-        rocsparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \
-      }); \
-      static void *p_##__name = dlsym(rocsparse_dso_handle, #__name); \
-      return reinterpret_cast<rocsparse_func>(p_##__name)(args...); \
-    } \
-  }; \
+#define DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP(__name) \
+  struct DynLoad__##__name { \
+    template <typename... Args> \
+    rocsparse_status operator()(Args... args) { \
+      using rocsparse_func = decltype(&::__name); \
+      std::call_once(rocsparse_dso_flag, []() { \
+        rocsparse_dso_handle = common::dynload::GetCusparseDsoHandle(); \
+      }); \
+      static void *p_##__name = dlsym(rocsparse_dso_handle, #__name); \
+      return reinterpret_cast<rocsparse_func>(p_##__name)(args...); \
+    } \
+  }; \
   extern DynLoad__##__name __name
 
 #if defined(PADDLE_WITH_HIP)
diff --git a/paddle/phi/backends/dynload/tensorrt.h b/paddle/phi/backends/dynload/tensorrt.h
index 7a74f93358b281..2001a427db7d4e 100644
--- a/paddle/phi/backends/dynload/tensorrt.h
+++ b/paddle/phi/backends/dynload/tensorrt.h
@@ -22,7 +22,7 @@ limitations under the License. */
 #include  // NOLINT
 
 #include "paddle/common/backends/dynload/dynamic_loader.h"
-#include "paddle/phi/core/enforce.h"
+#include "paddle/common/enforce.h"
 
 namespace phi {
 namespace dynload {
@@ -41,7 +41,7 @@ extern void* tensorrt_plugin_dso_handle;
     template <typename... Args> \
     void* operator()(Args... args) { \
       std::call_once(tensorrt_dso_flag, []() { \
-        tensorrt_dso_handle = phi::dynload::GetTensorRtHandle(); \
+        tensorrt_dso_handle = common::dynload::GetTensorRtHandle(); \
       }); \
       static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
       if (p_##__name == nullptr) { \
@@ -59,7 +59,7 @@ extern void* tensorrt_plugin_dso_handle;
     template <typename... Args> \
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       std::call_once(tensorrt_dso_flag, []() { \
-        tensorrt_dso_handle = phi::dynload::GetTensorRtHandle(); \
+        tensorrt_dso_handle = common::dynload::GetTensorRtHandle(); \
       }); \
       static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
       PADDLE_ENFORCE_NOT_NULL( \
@@ -76,7 +76,8 @@ extern void* tensorrt_plugin_dso_handle;
     template <typename... Args> \
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       std::call_once(tensorrt_plugin_dso_flag, []() { \
-        tensorrt_plugin_dso_handle = phi::dynload::GetTensorRtPluginHandle(); \
+        tensorrt_plugin_dso_handle = \
+            common::dynload::GetTensorRtPluginHandle(); \
       }); \
       static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name); \
       PADDLE_ENFORCE_NOT_NULL(p_##__name, \
diff --git a/paddle/phi/backends/dynload/warpctc.h b/paddle/phi/backends/dynload/warpctc.h
index a767d785f79ee6..a91d760ca28bd7 100644
--- a/paddle/phi/backends/dynload/warpctc.h
+++ b/paddle/phi/backends/dynload/warpctc.h
@@ -37,7 +37,7 @@ extern void* warpctc_dso_handle;
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
       using warpctcFunc = decltype(&::__name); \
       std::call_once(warpctc_dso_flag, []() { \
-        warpctc_dso_handle = phi::dynload::GetWarpCTCDsoHandle(); \
+        warpctc_dso_handle = common::dynload::GetWarpCTCDsoHandle(); \
       }); \
       static void* p_##__name = dlsym(warpctc_dso_handle, #__name); \
       return reinterpret_cast<warpctcFunc>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/dynload/warprnnt.h b/paddle/phi/backends/dynload/warprnnt.h
index 5c9315bf23b757..01c7cb5f835e0a 100644
--- a/paddle/phi/backends/dynload/warprnnt.h
+++ b/paddle/phi/backends/dynload/warprnnt.h
@@ -31,18 +31,18 @@ extern void* warprnnt_dso_handle;
 * (for each function) to dynamic load warprnnt routine
 * via operator overloading.
 */
-#define DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \
-  struct DynLoad__##__name { \
-    template <typename... Args> \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
-      using warprnntFunc = decltype(&::__name); \
-      std::call_once(warprnnt_dso_flag, []() { \
-        warprnnt_dso_handle = phi::dynload::GetWarpRNNTDsoHandle(); \
-      }); \
-      static void* p_##__name = dlsym(warprnnt_dso_handle, #__name); \
-      return reinterpret_cast<warprnntFunc>(p_##__name)(args...); \
-    } \
-  }; \
+#define DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \
+  struct DynLoad__##__name { \
+    template <typename... Args> \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
+      using warprnntFunc = decltype(&::__name); \
+      std::call_once(warprnnt_dso_flag, []() { \
+        warprnnt_dso_handle = common::dynload::GetWarpRNNTDsoHandle(); \
+      }); \
+      static void* p_##__name = dlsym(warprnnt_dso_handle, #__name); \
+      return reinterpret_cast<warprnntFunc>(p_##__name)(args...); \
+    } \
+  }; \
   extern DynLoad__##__name __name
 
 #define DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \
diff --git a/paddle/phi/backends/dynload/xpti.h b/paddle/phi/backends/dynload/xpti.h
index 9c65d606ba5bd0..d84b20640ad958 100644
--- a/paddle/phi/backends/dynload/xpti.h
+++ b/paddle/phi/backends/dynload/xpti.h
@@ -34,7 +34,7 @@ extern void *xpti_dso_handle;
     XPTIResult operator()(Args... args) { \
       using xptiFunc = decltype(&::__name); \
       std::call_once(xpti_dso_flag, []() { \
-        xpti_dso_handle = phi::dynload::GetXPTIDsoHandle(); \
+        xpti_dso_handle = common::dynload::GetXPTIDsoHandle(); \
       }); \
       static void *p_##__name = dlsym(xpti_dso_handle, #__name); \
       return reinterpret_cast<xptiFunc>(p_##__name)(args...); \
diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h
index 3c2b347776edbe..409e96006624da 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h
+++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #define PADDLE_CUDA_FP16
 #include "paddle/common/bfloat16.h"
 #include "paddle/common/complex.h"
+#include "paddle/common/enforce.h"
 #include "paddle/common/float16.h"
-#include "paddle/phi/core/enforce.h"
 
 namespace phi {
 namespace backends {
diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph.h b/paddle/phi/backends/gpu/cuda/cuda_graph.h
index cbf66a945a6ec3..38f9718494c712 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_graph.h
+++ b/paddle/phi/backends/gpu/cuda/cuda_graph.h
@@ -26,14 +26,14 @@
 
 #include "glog/logging.h"
 
+#include "paddle/common/enforce.h"
+#include "paddle/common/errors.h"
 #include "paddle/common/macros.h"
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/backends/device_code.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/common/place.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/utils/optional.h"
 
 #if CUDA_VERSION < 11000
diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h
index 61ed6fe65c0e70..b380e86ba62372 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_helper.h
+++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h
@@ -18,9 +18,9 @@
 #include  // NOLINT
 
 #include "paddle/common/bfloat16.h"
+#include "paddle/common/enforce.h"
 #include "paddle/common/float16.h"
 #include "paddle/phi/common/data_type.h"
-#include "paddle/phi/core/enforce.h"
 
 namespace phi {
 namespace backends {
diff --git a/paddle/phi/backends/gpu/cuda/cuda_info.cc b/paddle/phi/backends/gpu/cuda/cuda_info.cc
index 0af1beb782fcf0..14df0645a6d5cd 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_info.cc
+++ b/paddle/phi/backends/gpu/cuda/cuda_info.cc
@@ -16,7 +16,7 @@
 #include "glog/logging.h"
-#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; diff --git a/paddle/phi/backends/gpu/cuda/cudnn_desc.h b/paddle/phi/backends/gpu/cuda/cudnn_desc.h index d4fb6930bcc550..fdbc28ffc23d8b 100644 --- a/paddle/phi/backends/gpu/cuda/cudnn_desc.h +++ b/paddle/phi/backends/gpu/cuda/cudnn_desc.h @@ -23,8 +23,8 @@ #include #include +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { namespace backends { @@ -87,7 +87,7 @@ class ActivationDescriptor { void operator()(T* t) { if (t != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyActivationDescriptor(t)); + common::dynload::cudnnDestroyActivationDescriptor(t)); t = nullptr; } } @@ -95,12 +95,12 @@ class ActivationDescriptor { ActivationDescriptor() { T* raw_ptr; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateActivationDescriptor(&raw_ptr)); + common::dynload::cudnnCreateActivationDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } template void set(cudnnActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetActivationDescriptor( desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast(coef))); } @@ -118,7 +118,7 @@ class TensorDescriptor { void operator()(T* t) { if (t != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyTensorDescriptor(t)); + common::dynload::cudnnDestroyTensorDescriptor(t)); t = nullptr; } } @@ -126,7 +126,7 @@ class TensorDescriptor { TensorDescriptor() { T* raw_ptr; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&raw_ptr)); + common::dynload::cudnnCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -142,7 +142,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor( desc_.get(), ToCudnnDataType(tensor.dtype()), dims_with_group.size(), @@ -160,11 +160,11 @@ class TensorDescriptor { transformed_dims = dims; } PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetTensorNdDescriptorEx(desc_.get(), - format, - dtype, - transformed_dims.size(), - transformed_dims.data())); + common::dynload::cudnnSetTensorNdDescriptorEx(desc_.get(), + format, + dtype, + transformed_dims.size(), + transformed_dims.data())); } void set(const phi::DenseTensor& tensor, const cudnnTensorFormat_t format) { @@ -184,7 +184,7 @@ class FilterDescriptor { void operator()(T* t) { if (t != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyFilterDescriptor(t)); + common::dynload::cudnnDestroyFilterDescriptor(t)); t = nullptr; } } @@ -192,7 +192,7 @@ class FilterDescriptor { FilterDescriptor() { T* raw_ptr; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateFilterDescriptor(&raw_ptr)); + common::dynload::cudnnCreateFilterDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -212,11 +212,11 @@ class FilterDescriptor { transformed_dims[1] = transformed_dims[1] / groups; } PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetFilterNdDescriptor(desc_.get(), - dtype, - format, - transformed_dims.size(), - transformed_dims.data())); + common::dynload::cudnnSetFilterNdDescriptor(desc_.get(), + dtype, + 
format, + transformed_dims.size(), + transformed_dims.data())); } void set(const phi::DenseTensor& tensor, @@ -238,7 +238,7 @@ class ConvolutionDescriptor { void operator()(T* t) { if (t != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyConvolutionDescriptor(t)); + common::dynload::cudnnDestroyConvolutionDescriptor(t)); t = nullptr; } } @@ -246,7 +246,7 @@ class ConvolutionDescriptor { ConvolutionDescriptor() { T* raw_ptr; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateConvolutionDescriptor(&raw_ptr)); + common::dynload::cudnnCreateConvolutionDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -262,32 +262,32 @@ class ConvolutionDescriptor { cudnnDataType_t compute_type = (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; T* desc = desc_.get(); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetConvolutionNdDescriptor(desc, - pads.size(), - pads.data(), - strides.data(), - dilations.data(), - CUDNN_CROSS_CORRELATION, - compute_type)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetConvolutionNdDescriptor( + desc, + pads.size(), + pads.data(), + strides.data(), + dilations.data(), + CUDNN_CROSS_CORRELATION, + compute_type)); #if CUDNN_VERSION_MIN(7, 0, 1) PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetConvolutionGroupCount(desc, groups)); + common::dynload::cudnnSetConvolutionGroupCount(desc, groups)); #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetConvolutionMathType(desc, CUDNN_DEFAULT_MATH)); + common::dynload::cudnnSetConvolutionMathType(desc, CUDNN_DEFAULT_MATH)); if (dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetConvolutionMathType( desc, CUDNN_TENSOR_OP_MATH)); #if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) } else if (dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetConvolutionMathType( desc, CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8,1,0) } else if (dtype == CUDNN_DATA_FLOAT && !allow_tf32) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetConvolutionMathType(desc, CUDNN_FMA_MATH)); + common::dynload::cudnnSetConvolutionMathType(desc, CUDNN_FMA_MATH)); #endif // CUDA_VERSION >= 11000 } #endif diff --git a/paddle/phi/backends/gpu/cuda/cudnn_helper.h b/paddle/phi/backends/gpu/cuda/cudnn_helper.h index 5acd3f32075ca8..f527211f83c420 100644 --- a/paddle/phi/backends/gpu/cuda/cudnn_helper.h +++ b/paddle/phi/backends/gpu/cuda/cudnn_helper.h @@ -19,12 +19,12 @@ limitations under the License. 
*/ #include "paddle/common/backends/dynload/cudnn.h" #include "paddle/common/bfloat16.h" +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/common/float16.h" #include "paddle/common/macros.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/utils/flags.h" PD_DECLARE_bool(cudnn_deterministic); @@ -195,11 +195,11 @@ class ScopedTensorDescriptor { public: ScopedTensorDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&desc_)); + common::dynload::cudnnCreateTensorDescriptor(&desc_)); } ~ScopedTensorDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyTensorDescriptor(desc_)); + common::dynload::cudnnDestroyTensorDescriptor(desc_)); } inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, @@ -222,26 +222,27 @@ class ScopedTensorDescriptor { if (dims.size() == 4) { if (format == CUDNN_TENSOR_NCHW) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetTensorNdDescriptor(desc_, - type, - dims_with_group.size(), - dims_with_group.data(), - strides.data())); + common::dynload::cudnnSetTensorNdDescriptor(desc_, + type, + dims_with_group.size(), + dims_with_group.data(), + strides.data())); } else { // CUDNN_TENSOR_NHWC - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensor4dDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensor4dDescriptor( desc_, format, type, dims[0], dims[3], dims[1], dims[2])); } } else if (dims.size() == 5) { if (format == CUDNN_TENSOR_NCHW) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetTensorNdDescriptor(desc_, - type, - dims_with_group.size(), - dims_with_group.data(), - strides.data())); + common::dynload::cudnnSetTensorNdDescriptor(desc_, + type, + dims_with_group.size(), + dims_with_group.data(), + strides.data())); } else { // CUDNN_TENSOR_NHWC - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptorEx( - desc_, format, type, dims.size(), dims.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cudnnSetTensorNdDescriptorEx( + desc_, format, type, dims.size(), dims.data())); } } return desc_; @@ -258,7 +259,7 @@ class ScopedTensorDescriptor { inline cudnnTensorDescriptor_t descriptor(const cudnnDataType_t cudnn_type, const std::vector& dim, const std::vector& stride) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor( desc_, cudnn_type, dim.size(), dim.data(), stride.data())); return desc_; } @@ -281,12 +282,12 @@ class ScopedRNNTensorDescriptor { public: ScopedRNNTensorDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateRNNDataDescriptor(&desc_)); + common::dynload::cudnnCreateRNNDataDescriptor(&desc_)); } ~ScopedRNNTensorDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyRNNDataDescriptor(desc_)); + common::dynload::cudnnDestroyRNNDataDescriptor(desc_)); } inline cudnnRNNDataDescriptor_t descriptor( @@ -305,7 +306,7 @@ class ScopedRNNTensorDescriptor { layout = CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED; } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNDataDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetRNNDataDescriptor( desc_, cudnn_type, layout, @@ -345,11 +346,11 @@ class ScopedDropoutDescriptor { public: ScopedDropoutDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - 
phi::dynload::cudnnCreateDropoutDescriptor(&desc_)); + common::dynload::cudnnCreateDropoutDescriptor(&desc_)); } ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyDropoutDescriptor(desc_)); + common::dynload::cudnnDestroyDropoutDescriptor(desc_)); } inline cudnnDropoutDescriptor_t descriptor(const cudnnHandle_t& handle, @@ -361,22 +362,22 @@ class ScopedDropoutDescriptor { size_t state_size) { if (dropout_state_ == nullptr) { // for no dropout or test PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetDropoutDescriptor(desc_, - handle, - 0 /* dropout */, - nullptr, - 0 /* state_size */, - 0 /* seed */)); + common::dynload::cudnnSetDropoutDescriptor(desc_, + handle, + 0 /* dropout */, + nullptr, + 0 /* state_size */, + 0 /* seed */)); return desc_; } auto* dropout_state_data = dropout_state_->data(); if (!initialized) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, seed)); } else { auto dropout_state_dims = phi::vectorize(dropout_state_->dims()); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRestoreDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRestoreDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, 0)); } return desc_; @@ -391,10 +392,12 @@ class ScopedDropoutDescriptor { class ScopedRNNDescriptor { public: ScopedRNNDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnCreateRNNDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cudnnCreateRNNDescriptor(&desc_)); } ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroyRNNDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cudnnDestroyRNNDescriptor(desc_)); } inline cudnnRNNDescriptor_t desc() { return desc_; } @@ -408,11 +411,11 @@ class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateFilterDescriptor(&desc_)); + common::dynload::cudnnCreateFilterDescriptor(&desc_)); } ~ScopedFilterDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyFilterDescriptor(desc_)); + common::dynload::cudnnDestroyFilterDescriptor(desc_)); } inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, @@ -429,11 +432,11 @@ class ScopedFilterDescriptor { // NOTE: input filter(C) of the filter is already asserted to be C/groups. } PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetFilterNdDescriptor(desc_, - type, - format, - kernel_with_group.size(), - kernel_with_group.data())); + common::dynload::cudnnSetFilterNdDescriptor(desc_, + type, + format, + kernel_with_group.size(), + kernel_with_group.data())); return desc_; } @@ -456,11 +459,11 @@ class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateConvolutionDescriptor(&desc_)); + common::dynload::cudnnCreateConvolutionDescriptor(&desc_)); } ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyConvolutionDescriptor(desc_)); + common::dynload::cudnnDestroyConvolutionDescriptor(desc_)); } inline cudnnConvolutionDescriptor_t descriptor( @@ -486,14 +489,14 @@ class ScopedConvolutionDescriptor { cudnnDataType_t compute_type = (type == CUDNN_DATA_DOUBLE) ? 
CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetConvolutionNdDescriptor(desc_, - pads.size(), - pads.data(), - strides.data(), - dilations.data(), - CUDNN_CROSS_CORRELATION, - compute_type)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetConvolutionNdDescriptor( + desc_, + pads.size(), + pads.data(), + strides.data(), + dilations.data(), + CUDNN_CROSS_CORRELATION, + compute_type)); return desc_; } @@ -514,11 +517,11 @@ class ScopedPoolingDescriptor { public: ScopedPoolingDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreatePoolingDescriptor(&desc_)); + common::dynload::cudnnCreatePoolingDescriptor(&desc_)); } ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyPoolingDescriptor(desc_)); + common::dynload::cudnnDestroyPoolingDescriptor(desc_)); } inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode, @@ -540,7 +543,7 @@ class ScopedPoolingDescriptor { "received size of kernel is %d, size of strides is %d.", kernel.size(), strides.size())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetPoolingNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetPoolingNdDescriptor( desc_, (GetPoolingMode(mode)), CUDNN_PROPAGATE_NAN, // Always propagate nans. @@ -560,18 +563,18 @@ class ScopedSpatialTransformerDescriptor { public: ScopedSpatialTransformerDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateSpatialTransformerDescriptor(&desc_)); + common::dynload::cudnnCreateSpatialTransformerDescriptor(&desc_)); } ~ScopedSpatialTransformerDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroySpatialTransformerDescriptor(desc_)); + common::dynload::cudnnDestroySpatialTransformerDescriptor(desc_)); } template inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims, const int dimA[]) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetSpatialTransformerNdDescriptor( + common::dynload::cudnnSetSpatialTransformerNdDescriptor( desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType::type, @@ -589,11 +592,11 @@ class ScopedActivationDescriptor { public: ScopedActivationDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateActivationDescriptor(&desc_)); + common::dynload::cudnnCreateActivationDescriptor(&desc_)); } ~ScopedActivationDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyActivationDescriptor(desc_)); + common::dynload::cudnnDestroyActivationDescriptor(desc_)); } template @@ -630,7 +633,7 @@ class ScopedActivationDescriptor { "Unrecognized CUDNN activation mode: %d.", static_cast(activation_mode))); } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetActivationDescriptor( desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling)); return desc_; } @@ -645,17 +648,17 @@ class ScopedCTCLossDescriptor { public: ScopedCTCLossDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateCTCLossDescriptor(&desc_)); + common::dynload::cudnnCreateCTCLossDescriptor(&desc_)); } ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyCTCLossDescriptor(desc_)); + common::dynload::cudnnDestroyCTCLossDescriptor(desc_)); } template inline cudnnCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType::type)); + 
PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetCTCLossDescriptor( + desc_, CudnnDataType::type)); return desc_; } diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index f4598583466851..ccb4dc5126f3e3 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -55,7 +55,7 @@ limitations under the License. */ // without eigen. #include "unsupported/Eigen/CXX11/Tensor" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { @@ -381,7 +381,7 @@ struct GPUContext::Impl { } else { blas_tensor_core_handle_ = blas_tensor_core_handle_creator_(); } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + PADDLE_RETRY_CUDA_SUCCESS(common::dynload::cublasSetMathMode( blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); } #endif @@ -393,7 +393,7 @@ struct GPUContext::Impl { blas_tf32_tensor_core_handle_ = blas_tf32_tensor_core_handle_creator_(); } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + PADDLE_RETRY_CUDA_SUCCESS(common::dynload::cublasSetMathMode( blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); } #endif @@ -461,12 +461,12 @@ struct GPUContext::Impl { void DestroyInternalDnnHandle() { #ifdef PADDLE_WITH_HIP if (owned_ && dnn_handle_ != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(dnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenDestroy(dnn_handle_)); dnn_handle_ = nullptr; } #else if (owned_ && dnn_handle_ != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(dnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnDestroy(dnn_handle_)); dnn_handle_ = nullptr; } #endif // PADDLE_WITH_HIP @@ -583,7 +583,7 @@ struct GPUContext::Impl { } else { blas_tensor_core_handle_ = blas_tensor_core_handle_creator_(); } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + PADDLE_RETRY_CUDA_SUCCESS(common::dynload::cublasSetMathMode( blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); } #endif @@ -595,7 +595,7 @@ struct GPUContext::Impl { blas_tf32_tensor_core_handle_ = blas_tf32_tensor_core_handle_creator_(); } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + PADDLE_RETRY_CUDA_SUCCESS(common::dynload::cublasSetMathMode( blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); } #endif @@ -628,7 +628,7 @@ struct GPUContext::Impl { } else { blas_tensor_core_handle_ = blas_tensor_core_handle_creator_(); } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + PADDLE_RETRY_CUDA_SUCCESS(common::dynload::cublasSetMathMode( blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); } #endif @@ -640,7 +640,7 @@ struct GPUContext::Impl { blas_tf32_tensor_core_handle_ = blas_tf32_tensor_core_handle_creator_(); } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + PADDLE_RETRY_CUDA_SUCCESS(common::dynload::cublasSetMathMode( blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); } #endif diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index 132493f2c62cd2..f2348082d07dda 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -22,7 +22,7 @@ limitations under the License. 
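Note: the gpu_context.cc hunks above configure cuBLAS math modes through PADDLE_RETRY_CUDA_SUCCESS on dedicated handles (CUBLAS_TENSOR_OP_MATH for the Tensor Core handle, CUBLAS_TF32_TENSOR_OP_MATH for the TF32 handle). The same calls against the raw cuBLAS API look as follows — an illustrative sketch with a plain status check standing in for the retry macro; MakeTensorCoreHandle is hypothetical.

#include <cublas_v2.h>
#include <cstdio>

// Create a handle dedicated to Tensor Core GEMMs, mirroring what the hunks
// above do for blas_tensor_core_handle_. Returns nullptr on failure.
cublasHandle_t MakeTensorCoreHandle(cudaStream_t stream) {
  cublasHandle_t handle = nullptr;
  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) return nullptr;
  if (cublasSetStream(handle, stream) != CUBLAS_STATUS_SUCCESS ||
      cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH) !=
          CUBLAS_STATUS_SUCCESS) {
    std::fprintf(stderr, "cuBLAS handle setup failed\n");
    cublasDestroy(handle);
    return nullptr;
  }
  return handle;
}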
*/ #include "paddle/common/backends/gpu/gpu_types.h" -namespace phi { +namespace common { namespace backends { namespace gpu { @@ -142,6 +142,6 @@ class GPUDeviceGuard { } // namespace gpu } // namespace backends -} // namespace phi +} // namespace common #endif diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index fd712baf754803..87c98db8427058 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -31,8 +31,8 @@ #include #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/enforce.h" // CUDA performs better when thread_per_block is between [64, 512] #define PREDEFINED_BLOCK_SIZE 512 diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index bf611705ed59b0..8f60db495655a5 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -40,7 +40,7 @@ #include "glog/logging.h" #include "unsupported/Eigen/CXX11/Tensor" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { @@ -229,24 +229,24 @@ void DestoryStream(gpuStream_t stream) { void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { #ifdef PADDLE_WITH_HIP - phi::dynload::rocblas_create_handle(blas_handle); - phi::dynload::rocblas_set_stream(*blas_handle, stream); + common::dynload::rocblas_create_handle(blas_handle); + common::dynload::rocblas_set_stream(*blas_handle, stream); #else // PADDLE_WITH_CUDA - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); + PADDLE_RETRY_CUDA_SUCCESS(common::dynload::cublasCreate(blas_handle)); PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(*blas_handle, stream)); + common::dynload::cublasSetStream(*blas_handle, stream)); #endif // PADDLE_WITH_HIP } void DestroyBlasHandle(blasHandle_t handle) { #ifdef PADDLE_WITH_HIP if (handle != nullptr) { - phi::dynload::rocblas_destroy_handle(handle); + common::dynload::rocblas_destroy_handle(handle); handle = nullptr; } #else if (handle != nullptr) { - phi::dynload::cublasDestroy(handle); + common::dynload::cublasDestroy(handle); handle = nullptr; } #endif // PADDLE_WITH_HIP @@ -254,21 +254,21 @@ void DestroyBlasHandle(blasHandle_t handle) { void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - phi::dynload::cublasLtCreate(blaslt_handle); + common::dynload::cublasLtCreate(blaslt_handle); #endif } void DestroyBlasLtHandle(blasLtHandle_t handle) { #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 if (handle != nullptr) { - phi::dynload::cublasLtDestroy(handle); + common::dynload::cublasLtDestroy(handle); handle = nullptr; } #endif } void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { - if (phi::dynload::HasCUDNN()) { + if (common::dynload::HasCUDNN()) { #ifdef PADDLE_WITH_HIP size_t miopen_major, miopen_minor, miopen_patch; PADDLE_ENFORCE_GPU_SUCCESS( @@ -290,7 +290,7 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); #else - auto version = phi::dynload::cudnnGetVersion(); + auto version = common::dynload::cudnnGetVersion(); auto local_cudnn_major = (version < 9000) ? 
version / 1000 : version / 10000; auto local_cudnn_minor = @@ -305,8 +305,8 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { << "Please recompile or reinstall Paddle with compatible CUDNN " "version."; } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnCreate(handle)); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnSetStream(*handle, stream)); + PADDLE_RETRY_CUDA_SUCCESS(common::dynload::cudnnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(common::dynload::cudnnSetStream(*handle, stream)); #endif } else { *handle = nullptr; @@ -316,12 +316,12 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { void DestroyDnnHandle(dnnHandle_t handle) { #ifdef PADDLE_WITH_HIP if (handle != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenDestroy(handle)); handle = nullptr; } #else if (handle != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnDestroy(handle)); handle = nullptr; } #endif // PADDLE_WITH_HIP @@ -329,15 +329,17 @@ void DestroyDnnHandle(dnnHandle_t handle) { void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { #ifndef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); + PADDLE_RETRY_CUDA_SUCCESS(common::dynload::cusolverDnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS( + common::dynload::cusolverDnSetStream(*handle, stream)); #endif } void DestroySolverHandle(solverHandle_t solver_handle) { #ifndef PADDLE_WITH_HIP if (solver_handle != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cusolverDnDestroy(solver_handle)); solver_handle = nullptr; } #endif @@ -352,8 +354,8 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(*handle, stream)); #endif #elif defined(PADDLE_WITH_HIP) - phi::dynload::rocsparse_create_handle(handle); - phi::dynload::rocsparse_set_stream(*handle, stream); + common::dynload::rocsparse_create_handle(handle); + common::dynload::rocsparse_set_stream(*handle, stream); #endif } @@ -367,7 +369,7 @@ void DestroySparseHandle(sparseHandle_t handle) { #endif #elif defined(PADDLE_WITH_HIP) if (handle != nullptr) { - phi::dynload::rocsparse_destroy_handle(handle); + common::dynload::rocsparse_destroy_handle(handle); handle = nullptr; } #endif diff --git a/paddle/phi/backends/gpu/gpu_utils.h b/paddle/phi/backends/gpu/gpu_utils.h index 0bb0aef7be1f13..c598c488807dea 100644 --- a/paddle/phi/backends/gpu/gpu_utils.h +++ b/paddle/phi/backends/gpu/gpu_utils.h @@ -18,7 +18,7 @@ #include -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "unsupported/Eigen/CXX11/Tensor" namespace phi { diff --git a/paddle/phi/backends/gpu/rocm/miopen_desc.h b/paddle/phi/backends/gpu/rocm/miopen_desc.h index ae0e274ca650ef..f921b338cacca2 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_desc.h +++ b/paddle/phi/backends/gpu/rocm/miopen_desc.h @@ -23,8 +23,8 @@ #include #include +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { namespace backends { @@ -75,7 +75,7 @@ class ActivationDescriptor { void operator()(T* t) { if (t != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS( - 
phi::dynload::miopenDestroyActivationDescriptor(t)); + common::dynload::miopenDestroyActivationDescriptor(t)); t = nullptr; } } @@ -83,12 +83,12 @@ class ActivationDescriptor { ActivationDescriptor() { T* raw_ptr; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateActivationDescriptor(&raw_ptr)); + common::dynload::miopenCreateActivationDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } template void set(miopenActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetActivationDescriptor( desc_.get(), mode, static_cast(coef), 0.0, 0.0)); } @@ -106,7 +106,7 @@ class TensorDescriptor { void operator()(T* t) { if (t != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(t)); + common::dynload::miopenDestroyTensorDescriptor(t)); t = nullptr; } } @@ -114,7 +114,7 @@ class TensorDescriptor { TensorDescriptor() { T* raw_ptr; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); + common::dynload::miopenCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -131,7 +131,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.dtype()), static_cast(dims_with_group.size()), @@ -155,7 +155,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.dtype()), static_cast(dims_with_group.size()), @@ -174,7 +174,7 @@ class FilterDescriptor { void operator()(T* t) { if (t != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(t)); + common::dynload::miopenDestroyTensorDescriptor(t)); t = nullptr; } } @@ -182,7 +182,7 @@ class FilterDescriptor { FilterDescriptor() { T* raw_ptr; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&raw_ptr)); + common::dynload::miopenCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -205,7 +205,7 @@ class FilterDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.dtype()), static_cast(dims_with_group.size()), @@ -224,7 +224,7 @@ class ConvolutionDescriptor { void operator()(T* t) { if (t != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyConvolutionDescriptor(t)); + common::dynload::miopenDestroyConvolutionDescriptor(t)); t = nullptr; } } @@ -232,7 +232,7 @@ class ConvolutionDescriptor { ConvolutionDescriptor() { T* raw_ptr; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); + common::dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -244,14 +244,15 @@ class ConvolutionDescriptor { const std::vector& dilations, bool allow_tf32, const int groups = 1) { - 
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( - (miopenConvolutionDescriptor_t)desc_.get(), - static_cast(pads.size()), - const_cast(pads.data()), - const_cast(strides.data()), - const_cast(dilations.data()), - miopenConvolution)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetConvolutionGroupCount( + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::miopenInitConvolutionNdDescriptor( + (miopenConvolutionDescriptor_t)desc_.get(), + static_cast(pads.size()), + const_cast(pads.data()), + const_cast(strides.data()), + const_cast(dilations.data()), + miopenConvolution)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetConvolutionGroupCount( (miopenConvolutionDescriptor_t)desc_.get(), groups)); } diff --git a/paddle/phi/backends/gpu/rocm/miopen_helper.h b/paddle/phi/backends/gpu/rocm/miopen_helper.h index 61dab08f5db583..b27bec1aebc1ee 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_helper.h +++ b/paddle/phi/backends/gpu/rocm/miopen_helper.h @@ -20,13 +20,13 @@ limitations under the License. */ #include "paddle/utils/flags.h" #include "paddle/common/bfloat16.h" +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/common/float16.h" #include "paddle/common/macros.h" #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" // MIOPEN do not have epslion definition #define CUDNN_BN_MIN_EPSILON 1e-05 @@ -204,11 +204,11 @@ class ScopedTensorDescriptor { public: ScopedTensorDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&desc_)); + common::dynload::miopenCreateTensorDescriptor(&desc_)); } ~ScopedTensorDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(desc_)); + common::dynload::miopenDestroyTensorDescriptor(desc_)); } inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, @@ -234,14 +234,14 @@ class ScopedTensorDescriptor { MIOPEN_TENSOR_NCHW, phi::errors::InvalidArgument("format should ONLY be NCHW in MIOPEN.")); if (dims.size() == 4) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetTensorDescriptor( desc_, type, dims_with_group.size(), const_cast(dims_with_group.data()), const_cast(strides.data()))); } else if (dims.size() == 5) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetTensorDescriptor( desc_, type, dims_with_group.size(), @@ -262,7 +262,7 @@ class ScopedTensorDescriptor { inline miopenTensorDescriptor_t descriptor(const miopenDataType_t miopen_type, const std::vector& dim, const std::vector& stride) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetTensorDescriptor( desc_, miopen_type, dim.size(), @@ -288,11 +288,11 @@ class ScopedDropoutDescriptor { public: ScopedDropoutDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateDropoutDescriptor(&desc_)); + common::dynload::miopenCreateDropoutDescriptor(&desc_)); } ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyDropoutDescriptor(desc_)); + common::dynload::miopenDestroyDropoutDescriptor(desc_)); } inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, @@ -303,43 +303,44 @@ class 
ScopedDropoutDescriptor { int seed, size_t state_size) { if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSetDropoutDescriptor(desc_, - handle, - 0 /* dropout */, - nullptr, - 0 /* state_size */, - 0 /* seed */, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetDropoutDescriptor( + desc_, + handle, + 0 /* dropout */, + nullptr, + 0 /* state_size */, + 0 /* seed */, + false, + false, + MIOPEN_RNG_PSEUDO_XORWOW)); return desc_; } auto* dropout_state_data = dropout_state_->data(); if (!initialized) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSetDropoutDescriptor(desc_, - handle, - dropout_prob_, - dropout_state_data, - state_size, - seed, - false, - false, - MIOPEN_RNG_PSEUDO_XORWOW)); - } else { - auto dropout_state_dims = dropout_state_->dims(); - state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRestoreDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, - 0, + seed, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); + } else { + auto dropout_state_dims = dropout_state_->dims(); + state_size = dropout_state_dims[0]; + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::miopenRestoreDropoutDescriptor( + desc_, + handle, + dropout_prob_, + dropout_state_data, + state_size, + 0, + false, + false, + MIOPEN_RNG_PSEUDO_XORWOW)); } return desc_; } @@ -353,10 +354,12 @@ class ScopedDropoutDescriptor { class ScopedRNNDescriptor { public: ScopedRNNDescriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenCreateRNNDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::miopenCreateRNNDescriptor(&desc_)); } ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroyRNNDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::miopenDestroyRNNDescriptor(desc_)); } inline miopenRNNDescriptor_t desc() { return desc_; } @@ -370,11 +373,11 @@ class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateTensorDescriptor(&desc_)); + common::dynload::miopenCreateTensorDescriptor(&desc_)); } ~ScopedFilterDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyTensorDescriptor(desc_)); + common::dynload::miopenDestroyTensorDescriptor(desc_)); } inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, @@ -395,7 +398,7 @@ class ScopedFilterDescriptor { for (int k = kernel_with_group.size() - 2; k >= 0; k--) { stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetTensorDescriptor( desc_, type, kernel_with_group.size(), @@ -423,11 +426,11 @@ class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateConvolutionDescriptor(&desc_)); + common::dynload::miopenCreateConvolutionDescriptor(&desc_)); } ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyConvolutionDescriptor(desc_)); + common::dynload::miopenDestroyConvolutionDescriptor(desc_)); } inline miopenConvolutionDescriptor_t descriptor( @@ -450,13 +453,14 @@ class ScopedConvolutionDescriptor { "of pads is %d, size of dilations is %d.", pads.size(), dilations.size())); - 
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenInitConvolutionNdDescriptor( - desc_, - pads.size(), - const_cast(pads.data()), - const_cast(strides.data()), - const_cast(dilations.data()), - miopenConvolution)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::miopenInitConvolutionNdDescriptor( + desc_, + pads.size(), + const_cast(pads.data()), + const_cast(strides.data()), + const_cast(dilations.data()), + miopenConvolution)); return desc_; } @@ -477,11 +481,11 @@ class ScopedPoolingDescriptor { public: ScopedPoolingDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreatePoolingDescriptor(&desc_)); + common::dynload::miopenCreatePoolingDescriptor(&desc_)); } ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyPoolingDescriptor(desc_)); + common::dynload::miopenDestroyPoolingDescriptor(desc_)); } inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, @@ -503,7 +507,7 @@ class ScopedPoolingDescriptor { "received size of kernel is %d, size of strides is %d.", kernel.size(), strides.size())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetNdPoolingDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetNdPoolingDescriptor( desc_, GetPoolingMode(mode), kernel.size(), @@ -522,11 +526,11 @@ class ScopedActivationDescriptor { public: ScopedActivationDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateActivationDescriptor(&desc_)); + common::dynload::miopenCreateActivationDescriptor(&desc_)); } ~ScopedActivationDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyActivationDescriptor(desc_)); + common::dynload::miopenDestroyActivationDescriptor(desc_)); } template @@ -561,7 +565,7 @@ class ScopedActivationDescriptor { "Unrecognized MIOPEN activation mode: %d.", static_cast(activation_mode))); } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetActivationDescriptor( desc_, mode, relu_ceiling, 0.0, 0.0)); return desc_; } @@ -575,16 +579,16 @@ class ScopedCTCLossDescriptor { public: ScopedCTCLossDescriptor() { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenCreateCTCLossDescriptor(&desc_)); + common::dynload::miopenCreateCTCLossDescriptor(&desc_)); } ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDestroyCTCLossDescriptor(desc_)); + common::dynload::miopenDestroyCTCLossDescriptor(desc_)); } template inline miopenCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetCTCLossDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetCTCLossDescriptor( desc_, CudnnDataType::type, 0, false)); return desc_; } diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index edc23479c92380..1d9ac0d2e5226e 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -16,7 +16,7 @@ #include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc index 8392a0a45b38c4..77eb0d80853d77 100644 --- a/paddle/phi/backends/onednn/onednn_context.cc +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -14,8 +14,8 @@ #ifdef PADDLE_WITH_DNNL #include 
"paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/common/enforce.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/enforce.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/phi/backends/context_pool.h" diff --git a/paddle/phi/backends/xpu/enforce_xpu.h b/paddle/phi/backends/xpu/enforce_xpu.h index 0a2a21e236d040..9321a8c843ec72 100644 --- a/paddle/phi/backends/xpu/enforce_xpu.h +++ b/paddle/phi/backends/xpu/enforce_xpu.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once +#include "paddle/common/enforce.h" #include "paddle/phi/backends/xpu/xpu_header.h" -#include "paddle/phi/core/enforce.h" #ifdef PADDLE_WITH_XPU_BKCL #include "xpu/bkcl.h" #endif diff --git a/paddle/phi/capi/include/type_utils.h b/paddle/phi/capi/include/type_utils.h index 029ee42fe091bc..98d25aa1bd010f 100644 --- a/paddle/phi/capi/include/type_utils.h +++ b/paddle/phi/capi/include/type_utils.h @@ -15,10 +15,10 @@ #pragma once #if !defined(_WIN32) +#include "paddle/common/enforce.h" #include "paddle/phi/capi/include/c_data_type.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace capi { diff --git a/paddle/phi/common/int_array.cc b/paddle/phi/common/int_array.cc index 4b5d553006685b..75440bd2d5b818 100644 --- a/paddle/phi/common/int_array.cc +++ b/paddle/phi/common/int_array.cc @@ -14,10 +14,10 @@ limitations under the License. */ #include "paddle/phi/common/int_array.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/tensor_utils.h" namespace paddle { diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index e2a590ee4d210c..784394188ce406 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -17,11 +17,11 @@ #include // NOLINT #include +#include "paddle/common/enforce.h" #include "paddle/common/macros.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/device_context.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/stream.h" #include "paddle/utils/test_macros.h" diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc index 71b90361f8b6b0..60ad3b68fe7f0e 100644 --- a/paddle/phi/common/scalar.cc +++ b/paddle/phi/common/scalar.cc @@ -14,10 +14,10 @@ limitations under the License. */ #include "paddle/common/scalar.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_utils.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/common/transform.h b/paddle/phi/common/transform.h index d83b698a45bc6f..58b9d0ccf221ea 100644 --- a/paddle/phi/common/transform.h +++ b/paddle/phi/common/transform.h @@ -17,9 +17,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/enforce.h" #include "paddle/common/hostdevice.h" #include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/enforce.h" #if defined(__NVCC__) || defined(__HIPCC__) #include diff --git a/paddle/phi/core/compat/arg_map_context.cc b/paddle/phi/core/compat/arg_map_context.cc index 800245406afd3a..b924dab355564b 100644 --- a/paddle/phi/core/compat/arg_map_context.cc +++ b/paddle/phi/core/compat/arg_map_context.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/phi/core/compat/arg_map_context.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/utils/string/string_helper.h" namespace phi { diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index d4c5de0dbe6dc9..06b0651784dfd1 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -14,11 +14,11 @@ limitations under the License. */ #include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/xpu/xpu_info.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/compat/op_utils.h" -#include "paddle/phi/core/enforce.h" #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/device_manager.h" diff --git a/paddle/phi/core/compat/get_kerneltype_forvar_utils.cc b/paddle/phi/core/compat/get_kerneltype_forvar_utils.cc index e144af3757a40a..1c51762890e0ee 100644 --- a/paddle/phi/core/compat/get_kerneltype_forvar_utils.cc +++ b/paddle/phi/core/compat/get_kerneltype_forvar_utils.cc @@ -14,7 +14,7 @@ #include "paddle/phi/core/compat/get_kerneltype_forvar_utils.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { const std::string& GetKernelTypeForVarContext::GetVarName() const { diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index e8ba9a8295816c..beee1aebe72197 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -18,10 +18,10 @@ limitations under the License. */ #include #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/common/macros.h" #include "paddle/common/type_defs.h" #include "paddle/phi/core/compat/arg_map_context.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/utils/flat_hash_map.h" diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index b1565643c97878..87db6a04097879 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -30,7 +30,7 @@ using gpuStream_t = hipStream_t; #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { diff --git a/paddle/phi/core/ddim.cc b/paddle/phi/core/ddim.cc deleted file mode 100644 index ff95346be17c7a..00000000000000 --- a/paddle/phi/core/ddim.cc +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/ddim.h" - -#include - -namespace phi { - -DDim make_ddim(std::initializer_list dims) { - return DDim(dims.begin(), static_cast(dims.size())); -} - -DDim make_ddim(const std::vector& dims) { - return DDim(dims.data(), static_cast(dims.size())); -} - -DDim make_ddim(const std::vector& dims) { - return DDim(dims.data(), static_cast(dims.size())); -} - -struct DDimEqualityVisitor { - explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {} - - template - inline bool operator()(const Dim& self) const { - return UnrollCompare::Run(self.Get(), d_); - } - - const int64_t* d_; -}; - -bool DDim::operator==(const DDim& d) const { - if (size() == -1 && d.size() == -1) { - return true; - } else if (size() == -1 || d.size() == -1) { - return false; - } else { - return size() == d.size() && - this->apply_visitor(DDimEqualityVisitor(d.Get())); - } -} - -bool DDim::operator!=(const DDim& d) const { return !(*this == d); } - -std::string DDim::to_str() const { - std::stringstream ss; - ss << '['; - if (rank_ > 0) ss << dim_[0]; - - for (int i = 1; i < rank_; ++i) ss << ", " << dim_[i]; - ss << ']'; - return ss.str(); -} - -struct ProductVisitor { - template - inline int64_t operator()(const Dim& dim) { - return product(dim); - } -}; - -int64_t product(const DDim& ddim) { - if (ddim.size() == -1) { - return 0; - } - return ddim.apply_visitor(ProductVisitor()); -} - -bool contain_unknown_dim(const DDim& ddim) { - for (int i = 0; i < ddim.size(); ++i) { - if (ddim[i] < 0) { - return true; - } - } - - return false; -} - -DDim slice_ddim(const DDim& dim, int begin, int end) { - PADDLE_ENFORCE_EQ( - (begin >= 0 && end <= dim.size()), - true, - phi::errors::InvalidArgument( - "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", - begin, - end, - dim.size())); - // Constructor of DDim would check whether end - begin is valid - return DDim(dim.Get() + begin, end - begin); -} - -int arity(const DDim& d) { return d.size(); } - -struct DDimPrinter { - std::ostream& os; - explicit DDimPrinter(std::ostream& os_) : os(os_) {} - - template - void operator()(const Dim& t) { - os << t; - } -}; - -std::ostream& operator<<(std::ostream& os, const DDim& ddim) { - if (ddim.size() == -1) { - return os; - } - ddim.apply_visitor(DDimPrinter(os)); - return os; -} - -DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims) { - PADDLE_ENFORCE_GE( - src.size(), - 3, - phi::errors::InvalidArgument("The rank of src dim should be at least 3 " - "in flatten_to_3d, but received %d.", - src.size())); - PADDLE_ENFORCE_EQ( - (num_row_dims >= 1 && num_row_dims < src.size()), - true, - phi::errors::InvalidArgument("The num_row_dims should be inside [1, %d] " - "in flatten_to_3d, but received %d.", - src.size() - 1, - num_row_dims)); - PADDLE_ENFORCE_EQ( - (num_col_dims >= 2 && num_col_dims <= src.size()), - true, - phi::errors::InvalidArgument("The num_col_dims should be inside [2, %d] " - "in flatten_to_3d, but received %d.", - src.size(), - num_col_dims)); - PADDLE_ENFORCE_GE( - num_col_dims, - num_row_dims, - phi::errors::InvalidArgument( - "The num_row_dims should be less than num_col_dims in flatten_to_3d," - "but received num_row_dims = %d, num_col_dims = %d.", - num_row_dims, - num_col_dims)); - - return DDim({product(slice_ddim(src, 0, num_row_dims)), - product(slice_ddim(src, num_row_dims, num_col_dims)), - product(slice_ddim(src, num_col_dims, src.size()))}); -} - -DDim 
flatten_to_2d(const DDim& src, int num_col_dims) { - return DDim({product(slice_ddim(src, 0, num_col_dims)), - product(slice_ddim(src, num_col_dims, src.size()))}); -} - -DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); } - -DDim stride(const DDim& ddim) { - DDim strides; - strides.rank_ = ddim.size(); - if (ddim.size() > 0) strides[ddim.size() - 1] = 1; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i + 1]; - } - return strides; -} - -DDim stride_numel(const DDim& ddim) { - DDim strides; - strides.rank_ = ddim.size(); - if (ddim.size() > 0) strides[ddim.size() - 1] = ddim[ddim.size() - 1]; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i]; - } - return strides; -} - -DDim DDim::reshape(std::vector& shape) const { - const DDim& in_dims = *this; - - for (int i = 0; i < static_cast(shape.size()); ++i) { - if (shape[i] == 0) { - shape[i] = static_cast(in_dims.at(i)); - } - } - - // Dim marked as "-1" must be inferred - auto it = std::find(shape.begin(), shape.end(), -1); - if (it != shape.end()) { - int index = static_cast(std::distance(shape.begin(), it)); - int reshape_out_product = - std::accumulate(shape.begin(), shape.end(), -1, std::multiplies()); - shape[index] = static_cast(product(in_dims)) / reshape_out_product; - } - - return phi::make_ddim(shape); -} - -DDim DDim::transpose(const std::vector& axis) const { - const DDim& in_dims = *this; - - DDim out_dims(in_dims); - for (int i = 0; i < static_cast(axis.size()); i++) { - out_dims[i] = in_dims[axis[i]]; - } - return out_dims; -} - -} // namespace phi - -namespace std { - -std::size_t hash::operator()(phi::DDim const& ddim) const { - int ndim = ddim.size(); - std::size_t seed = ndim; - for (int i = 0; i < ndim; ++i) { - seed ^= ddim.Get()[i] + 0x9e3779b9 + (seed << 6) + (seed >> 2); - } - return seed; -} - -} // namespace std diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h deleted file mode 100644 index 22df7f9a1044ed..00000000000000 --- a/paddle/phi/core/ddim.h +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
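Note: the deleted stride()/stride_numel() definitions above are the usual row-major recurrences. Restated over std::vector so the arithmetic can be checked in isolation — a sketch, not the relocated DDim API. For dims {2, 3, 4}, Stride gives {12, 4, 1} and StrideNumel gives {24, 12, 4}.

#include <cstdint>
#include <vector>

// Elements to skip per unit step along axis i (row-major layout).
std::vector<int64_t> Stride(const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size());
  if (!dims.empty()) strides.back() = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i)
    strides[i] = strides[i + 1] * dims[i + 1];
  return strides;
}

// Number of elements in the trailing subtensor starting at axis i.
std::vector<int64_t> StrideNumel(const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size());
  if (!dims.empty()) strides.back() = dims.back();
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i)
    strides[i] = strides[i + 1] * dims[i];
  return strides;
}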
-#pragma once -#include -#include -#include -#include -#include - -#include "paddle/common/dim.h" -#include "paddle/common/exception.h" -#include "paddle/utils/test_macros.h" - -namespace phi { - -#define PADDLE_VISIT_DDIM_BASE(rank, callback) \ - case (rank): { \ - constexpr auto kRank = (rank); \ - return (callback); \ - } - -#define PADDLE_VISIT_DDIM(rank, callback) \ - switch (rank) { \ - PADDLE_VISIT_DDIM_BASE(0, callback); \ - PADDLE_VISIT_DDIM_BASE(1, callback); \ - PADDLE_VISIT_DDIM_BASE(2, callback); \ - PADDLE_VISIT_DDIM_BASE(3, callback); \ - PADDLE_VISIT_DDIM_BASE(4, callback); \ - PADDLE_VISIT_DDIM_BASE(5, callback); \ - PADDLE_VISIT_DDIM_BASE(6, callback); \ - PADDLE_VISIT_DDIM_BASE(7, callback); \ - PADDLE_VISIT_DDIM_BASE(8, callback); \ - PADDLE_VISIT_DDIM_BASE(9, callback); \ - default: \ - PD_THROW( \ - "Unimplemented error. Invalid dimension to be accessed. Now only " \ - "supports access to " \ - "dimension 0 to 9, but received dimension is ", \ - rank, \ - "."); \ - } - -template -inline void dynamic_dim_assign(const T1* in, T2* out, int n) { - if (n == -1) { - return; - } - PADDLE_VISIT_DDIM(n, (common::static_dim_assign(in, out))); -} - -/** - * \brief A dynamically sized dimension. - * - * The number of dimensions must be between [1, 9]. - */ -class DDim { - public: - constexpr static int kMaxRank = 9; - - DDim() : rank_(-1) { dim_[0] = 0; } - - DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); } - - DDim(const int* d, int n) : rank_(n) { - dynamic_dim_assign(d, dim_.GetMutable(), n); - } - - DDim(const int64_t* d, int n) : rank_(n) { - dynamic_dim_assign(d, dim_.GetMutable(), n); - } - - template - /*implicit*/ DDim(const common::Dim& in) : rank_(D) { // NOLINT - UnsafeCast() = in; - } - - /*implicit*/ DDim(std::initializer_list init_list) - : DDim(init_list.begin(), init_list.size()) {} - - inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); } - - template - inline DDim& operator=(const common::Dim& dim) { - rank_ = D; - UnsafeCast() = dim; - return *this; - } - - inline int64_t& operator[](int idx) { return dim_[idx]; } - - inline int64_t operator[](int idx) const { return dim_[idx]; } - - int64_t& at(int idx) { - PADDLE_ENFORCE_GE(idx, - 0, - common::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - PADDLE_ENFORCE_LT(idx, - rank_, - common::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - return dim_[idx]; - } - - int64_t at(int idx) const { - PADDLE_ENFORCE_GE(idx, - 0, - common::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, - idx)); - PADDLE_ENFORCE_LT(idx, - rank_, - common::errors::InvalidArgument( - "Invalid DDim index to be accessed. 
The valid index "
-                          "is between 0 and %d, but received index is %d.",
-                          rank_,
-                          idx));
-    return dim_[idx];
-  }
-
-  template <typename Visitor>
-  typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
-      Visitor&& visitor) {
-    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
-  }
-
-  template <typename Visitor>
-  typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
-      Visitor&& visitor) const {
-    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
-  }
-
-  bool operator==(const DDim& d) const;
-
-  bool operator!=(const DDim& d) const;
-
-  inline const int64_t* Get() const { return dim_.Get(); }
-
-  inline int64_t* GetMutable() { return dim_.GetMutable(); }
-
-  inline int size() const { return rank_; }
-
-  std::string to_str() const;
-
-  DDim reshape(std::vector<int>& shape) const;  // NOLINT
-
-  DDim transpose(const std::vector<int>& axis) const;
-
- private:
-  template <int D>
-  inline common::Dim<D>& UnsafeCast() {
-    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
-    auto* p = static_cast<void*>(&dim_);
-    return *reinterpret_cast<common::Dim<D>*>(p);
-  }
-
-  template <int D>
-  inline const common::Dim<D>& UnsafeCast() const {
-    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
-    auto* p = static_cast<const void*>(&dim_);
-    return *reinterpret_cast<const common::Dim<D>*>(p);
-  }
-
-  inline DDim& CopyFrom(const DDim& ddim) {
-    if (ddim.rank_ == -1) {
-      rank_ = -1;
-      return *this;
-    }
-    PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
-  }
-
-  friend DDim stride(const DDim& ddim);
-  friend DDim stride_numel(const DDim& ddim);
-
- private:
-  common::Dim<kMaxRank> dim_;
-  int rank_;
-};
-
-#undef PADDLE_VISIT_DDIM_BASE
-#undef PADDLE_VISIT_DDIM
-
-/**
- * \brief Make a DDim from std::vector<int64_t>
- *
- * \param dims A vector of ints. Must be sized between [1, 9]
- */
-TEST_API DDim make_ddim(const std::vector<int64_t>& dims);
-
-TEST_API DDim make_ddim(const std::vector<int>& dims);
-
-/**
- * \brief Make a DDim from an initializer list
- *
- * \param dims An initializer list of ints. Must be sized between [1, 9]
- *
- */
-TEST_API DDim make_ddim(std::initializer_list<int64_t> dims);
-
-template <typename T = int64_t>
-std::vector<T> vectorize(const DDim& ddim) {
-  if (ddim.size() == -1) {
-    return std::vector<T>({0});
-  }
-  std::vector<T> result(DDim::kMaxRank);
-  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
-  result.resize(ddim.size());
-  return result;
-}
-
-TEST_API int64_t product(const DDim& ddim);
-
-bool contain_unknown_dim(const DDim& ddim);
-
-/**
- * \brief Slice a ddim
- *
- * Slice dim with [begin, end).
- * e.g.  DDim d = make_ddim({1,2,3,4,5});
- *       slice_ddim(d, 1, 3); ====> {2,3}
- */
-DDim slice_ddim(const DDim& dim, int begin, int end);
-
-/**
- * \brief What is the length of this dimension?
- *
- * \param Dynamic dimension to inspect
- */
-
-int arity(const DDim& ddim);
-
-TEST_API std::ostream& operator<<(std::ostream&, const DDim&);
-
-/**
- * \brief Flatten dim to 3d
- * e.g., DDim d = make_ddim({1, 2, 3, 4, 5, 6})
- *       flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30}
- */
-DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims);
-
-// Reshape a tensor to a matrix. The matrix's first dimension (column length)
-// will be the product of tensor's first `num_col_dims` dimensions.
-DDim flatten_to_2d(const DDim& src, int num_col_dims); - -DDim flatten_to_1d(const DDim& src); - -DDim stride(const DDim& ddim); - -DDim stride_numel(const DDim& ddim); -} // namespace phi - -namespace paddle { -namespace framework { - -using DDim = phi::DDim; - -} // namespace framework -} // namespace paddle - -namespace std { -template <> -struct hash { - std::size_t operator()(phi::DDim const& ddim) const; -}; -} // namespace std diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index 3804802e84260d..f148ad6e255744 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -18,8 +18,8 @@ #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" #endif +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/string_tensor.h" diff --git a/paddle/phi/core/distributed/auto_parallel/device_mesh.h b/paddle/phi/core/distributed/auto_parallel/device_mesh.h index 0888d5e2e7a2a6..03571c7932f33b 100644 --- a/paddle/phi/core/distributed/auto_parallel/device_mesh.h +++ b/paddle/phi/core/distributed/auto_parallel/device_mesh.h @@ -23,9 +23,9 @@ limitations under the License. */ #include #include +#include "paddle/common/enforce.h" #include "paddle/phi/core/distributed/auto_parallel/auto_parallel.pb.h" #include "paddle/phi/core/distributed/auto_parallel/utils.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/auto_parallel/dist_attr.h b/paddle/phi/core/distributed/auto_parallel/dist_attr.h index 6689750d24ad9c..a9643912e3f5da 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_attr.h +++ b/paddle/phi/core/distributed/auto_parallel/dist_attr.h @@ -21,11 +21,11 @@ limitations under the License. */ #include #include +#include "paddle/common/enforce.h" #include "paddle/phi/common/reduce_type.h" #include "paddle/phi/core/distributed/auto_parallel/auto_parallel.pb.h" #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" #include "paddle/phi/core/distributed/auto_parallel/utils.h" -#include "paddle/phi/core/enforce.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/test_macros.h" diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h index 922bdbebf895ec..ee8793ae687a78 100644 --- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include +#include "paddle/common/enforce.h" #include "paddle/common/macros.h" #include "paddle/common/scalar.h" #include "paddle/common/type_defs.h" @@ -27,7 +28,6 @@ limitations under the License. */ #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" #include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" #include "paddle/phi/core/distributed/type_defs.h" -#include "paddle/phi/core/enforce.h" #include "paddle/utils/any.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" diff --git a/paddle/phi/core/distributed/auto_parallel/process_mesh.h b/paddle/phi/core/distributed/auto_parallel/process_mesh.h index d512255ec10359..60a8031c2cc7b6 100644 --- a/paddle/phi/core/distributed/auto_parallel/process_mesh.h +++ b/paddle/phi/core/distributed/auto_parallel/process_mesh.h @@ -20,10 +20,10 @@ limitations under the License. 
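Note: the doc-comments in the deleted ddim.h a few hunks up carry two worked examples — slice_ddim(d, 1, 3) on {1,2,3,4,5} yields {2,3}, and flatten_to_3d(d, 2, 4) on {1,2,3,4,5,6} yields {2,12,30}. The following self-contained check of that arithmetic uses plain vectors, not the relocated DDim API; Product is a hypothetical helper.

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Product of dims[b..e), i.e. what product(slice_ddim(d, b, e)) computes.
int64_t Product(const std::vector<int64_t>& v, size_t b, size_t e) {
  return std::accumulate(v.begin() + b, v.begin() + e, int64_t{1},
                         std::multiplies<int64_t>());
}

int main() {
  std::vector<int64_t> d{1, 2, 3, 4, 5, 6};
  // flatten_to_3d(d, 2, 4): {1*2, 3*4, 5*6} == {2, 12, 30}
  assert(Product(d, 0, 2) == 2);
  assert(Product(d, 2, 4) == 12);
  assert(Product(d, 4, 6) == 30);
  // slice_ddim(make_ddim({1,2,3,4,5}), 1, 3) keeps indices [1, 3): {2, 3}
  std::vector<int64_t> s{1, 2, 3, 4, 5};
  std::vector<int64_t> sliced(s.begin() + 1, s.begin() + 3);
  assert(sliced.size() == 2 && sliced[0] == 2 && sliced[1] == 3);
  return 0;
}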
*/ #include #include +#include "paddle/common/enforce.h" #include "paddle/phi/core/distributed/auto_parallel/auto_parallel.pb.h" #include "paddle/phi/core/distributed/auto_parallel/device_mesh.h" #include "paddle/phi/core/distributed/auto_parallel/utils.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc index e7a1ec15da307a..c0a7d2dc59dd59 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc @@ -15,13 +15,13 @@ #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h" #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/device_context.h" #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/store/store_utils.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/auto_parallel/utils.h b/paddle/phi/core/distributed/auto_parallel/utils.h index 915c1565296700..ec6ab156050aa2 100644 --- a/paddle/phi/core/distributed/auto_parallel/utils.h +++ b/paddle/phi/core/distributed/auto_parallel/utils.h @@ -21,7 +21,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { namespace distributed { @@ -52,7 +52,7 @@ inline int64_t canonical_dim(int dim, int ndim) { PADDLE_ENFORCE_EQ( dim >= -ndim && dim < ndim, true, - errors::InvalidArgument( + common::errors::InvalidArgument( "Dimension %d is outside of [-%d, %d).", dim, ndim, ndim)); if (dim < 0) { return dim + ndim; diff --git a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc index 0cb295b1787a55..9836d04cdf2c84 100644 --- a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc +++ b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc @@ -16,9 +16,9 @@ #include "glog/logging.h" +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #if defined(PADDLE_WITH_RCCL) #include @@ -64,13 +64,13 @@ void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, PADDLE_ENFORCE_GPU_SUCCESS( gpuMemcpy(dtype_device, &dtype_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBroadcast(dtype_device, - dtype_device, - 1, - ncclInt64, - root_rank, - comm, - kDefaultStream)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::ncclBroadcast(dtype_device, + dtype_device, + 1, + ncclInt64, + root_rank, + comm, + kDefaultStream)); if (root_rank == cur_rank) { VLOG(3) << "Dynamic check broadcast metadata, dtype: " << dtype_host; @@ -106,13 +106,13 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& tensor, PADDLE_ENFORCE_GPU_SUCCESS( gpuMemcpy(shape_device, &shape_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBroadcast(shape_device, - shape_device, - 1, - ncclInt64, - root_rank, - comm, - kDefaultStream)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::ncclBroadcast(shape_device, + 
shape_device, + 1, + ncclInt64, + root_rank, + comm, + kDefaultStream)); if (root_rank == cur_rank) { VLOG(3) << "Dynamic check broadcast metadata, shape: " << shape_host; @@ -143,14 +143,14 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& out_tensor, PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&in_shape_device, kSize)); PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy( in_shape_device, &in_shape_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduce(in_shape_device, - in_shape_device, - 1, - ncclInt64, - ncclSum, - rank, - comm, - kDefaultStream)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::ncclReduce(in_shape_device, + in_shape_device, + 1, + ncclInt64, + ncclSum, + rank, + comm, + kDefaultStream)); if (rank == cur_rank) { PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy( &in_shape_host, in_shape_device, kSize, gpuMemcpyDeviceToHost)); @@ -178,13 +178,13 @@ void NCCLDynamicCheck::CheckGatherShape( world_size * sizeof(int64_t), gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(in_shape_device, - in_shape_device, - world_size, - ncclInt64, - ncclSum, - comm, - kDefaultStream)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::ncclAllReduce(in_shape_device, + in_shape_device, + world_size, + ncclInt64, + ncclSum, + comm, + kDefaultStream)); PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy(shapes.data(), in_shape_device, world_size * sizeof(int64_t), diff --git a/paddle/phi/core/distributed/check/static_check.cc b/paddle/phi/core/distributed/check/static_check.cc index 8ec3e19e6038ea..b6e208c677cd73 100644 --- a/paddle/phi/core/distributed/check/static_check.cc +++ b/paddle/phi/core/distributed/check/static_check.cc @@ -17,9 +17,9 @@ #include #include +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc index 2a5b336f34e256..9450aa51cd2b91 100644 --- a/paddle/phi/core/distributed/comm_context_manager.cc +++ b/paddle/phi/core/distributed/comm_context_manager.cc @@ -18,9 +18,9 @@ #include #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/distributed/store/store.h" -#include "paddle/phi/core/enforce.h" #if defined(PADDLE_WITH_GLOO) #include @@ -65,7 +65,7 @@ void CommContextManager::CreateNCCLCommContext( } ncclUniqueId nccl_id; if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::ncclGetUniqueId(&nccl_id)); } std::string unique_key = "NCCLCommContext/" + unique_comm_key + hash_key; diff --git a/paddle/phi/core/distributed/comm_task.h b/paddle/phi/core/distributed/comm_task.h index 079c16902b7b9d..489700934d61e6 100644 --- a/paddle/phi/core/distributed/comm_task.h +++ b/paddle/phi/core/distributed/comm_task.h @@ -18,9 +18,9 @@ #include #include #include +#include "paddle/common/enforce.h" #include "paddle/common/macros.h" #include "paddle/phi/core/distributed/utils.h" -#include "paddle/phi/core/enforce.h" #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" diff --git a/paddle/phi/core/distributed/comm_task_manager.cc b/paddle/phi/core/distributed/comm_task_manager.cc index 37083119b59f59..a32d433739b2fd 100644 --- 
a/paddle/phi/core/distributed/comm_task_manager.cc +++ b/paddle/phi/core/distributed/comm_task_manager.cc @@ -27,9 +27,9 @@ #include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/distributed/store/store.h" -#include "paddle/phi/core/enforce.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/comm_task_manager.h" diff --git a/paddle/phi/core/distributed/gloo_comm_context.cc b/paddle/phi/core/distributed/gloo_comm_context.cc index 098bc851bf11c3..863d8d76e50359 100644 --- a/paddle/phi/core/distributed/gloo_comm_context.cc +++ b/paddle/phi/core/distributed/gloo_comm_context.cc @@ -24,10 +24,10 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/distributed/check/static_check.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/gloo_utils.cc b/paddle/phi/core/distributed/gloo_utils.cc index 312681384a1996..1472e15420ca2b 100644 --- a/paddle/phi/core/distributed/gloo_utils.cc +++ b/paddle/phi/core/distributed/gloo_utils.cc @@ -26,10 +26,10 @@ #include #include +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/distributed/gloo_utils.h" #include "paddle/phi/core/distributed/store/tcp_utils.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc index d1d92c98fb0fd6..c2bedaf4d613b0 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.cc +++ b/paddle/phi/core/distributed/nccl_comm_context.cc @@ -16,13 +16,13 @@ #include "glog/logging.h" +#include "paddle/common/data_type.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/distributed/check/nccl_dynamic_check.h" #include "paddle/phi/core/distributed/check/static_check.h" #include "paddle/phi/core/distributed/nccl_tools.h" #include "paddle/phi/core/distributed/utils.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { namespace distributed { @@ -33,8 +33,8 @@ constexpr bool FLAGS_enable_nccl_dynamic_check = false; NCCLCommContext::NCCLCommContext(int rank, int size, ncclUniqueId nccl_id) : CommContext(rank, size) { NCCL_CHECK( - phi::dynload::ncclCommInitRank(&nccl_comm_, size_, nccl_id, rank_)); - NCCL_CHECK(phi::dynload::ncclGetVersion(&nccl_version_)); + common::dynload::ncclCommInitRank(&nccl_comm_, size_, nccl_id, rank_)); + NCCL_CHECK(common::dynload::ncclGetVersion(&nccl_version_)); } int NCCLCommContext::GetNcclVersion() { return nccl_version_; } @@ -77,13 +77,13 @@ void NCCLCommContext::Broadcast(phi::DenseTensor* out_tensor, if (FLAGS_enable_nccl_dynamic_check) { NCCLDynamicCheck::CheckShape(*out_tensor, root, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclBroadcast(in_tensor.data(), - out_tensor->data(), - in_tensor.numel(), - ToNCCLDataType(in_tensor.type()), - root, - nccl_comm_, - stream)); + NCCL_CHECK(common::dynload::ncclBroadcast(in_tensor.data(), + out_tensor->data(), + in_tensor.numel(), + ToNCCLDataType(in_tensor.type()), + root, + nccl_comm_, + stream)); } void NCCLCommContext::AllGather(phi::DenseTensor* out_tensor, @@ -100,12 +100,12 @@ void 
NCCLCommContext::AllGather(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclAllGather(in_tensor.data(), - out_tensor->data(), - in_tensor.numel(), - ToNCCLDataType(in_tensor.type()), - nccl_comm_, - stream)); + NCCL_CHECK(common::dynload::ncclAllGather(in_tensor.data(), + out_tensor->data(), + in_tensor.numel(), + ToNCCLDataType(in_tensor.type()), + nccl_comm_, + stream)); } void NCCLCommContext::ReduceScatter(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -122,13 +122,14 @@ void NCCLCommContext::ReduceScatter(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclReduceScatter(in_tensor.data(), - out_tensor->data(), - out_tensor->numel(), - ToNCCLDataType(in_tensor.type()), - reduce_type, - nccl_comm_, - stream)); + NCCL_CHECK( + common::dynload::ncclReduceScatter(in_tensor.data(), + out_tensor->data(), + out_tensor->numel(), + ToNCCLDataType(in_tensor.type()), + reduce_type, + nccl_comm_, + stream)); } void NCCLCommContext::Send(const phi::DenseTensor& in_tensor, @@ -141,12 +142,12 @@ void NCCLCommContext::Send(const phi::DenseTensor& in_tensor, NCCLDynamicCheck::CheckShape(in_tensor, rank_, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclSend(in_tensor.data(), - count, - ToNCCLDataType(in_tensor.dtype()), - peer, - nccl_comm_, - stream)); + NCCL_CHECK(common::dynload::ncclSend(in_tensor.data(), + count, + ToNCCLDataType(in_tensor.dtype()), + peer, + nccl_comm_, + stream)); VLOG(3) << "rank " << GetRank() << " send " << phi::product(in_tensor.dims()) << " to " << peer; } @@ -160,12 +161,12 @@ void NCCLCommContext::Recv(phi::DenseTensor* out_tensor, NCCLDynamicCheck::CheckShape(*out_tensor, peer, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclRecv(out_tensor->data(), - count, - ToNCCLDataType(out_tensor->dtype()), - peer, - nccl_comm_, - stream)); + NCCL_CHECK(common::dynload::ncclRecv(out_tensor->data(), + count, + ToNCCLDataType(out_tensor->dtype()), + peer, + nccl_comm_, + stream)); VLOG(3) << "rank " << GetRank() << " recv " << phi::product(out_tensor->dims()) << " from " << peer; } @@ -185,13 +186,13 @@ void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclAllReduce(in_tensor.data(), - out_tensor->data(), - in_tensor.numel(), - ToNCCLDataType(in_tensor.type()), - reduce_type, - nccl_comm_, - stream)); + NCCL_CHECK(common::dynload::ncclAllReduce(in_tensor.data(), + out_tensor->data(), + in_tensor.numel(), + ToNCCLDataType(in_tensor.type()), + reduce_type, + nccl_comm_, + stream)); } void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, @@ -210,32 +211,34 @@ void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - NCCL_CHECK(phi::dynload::ncclReduce(in_tensor.data(), - out_tensor->data(), - in_tensor.numel(), - ToNCCLDataType(in_tensor.type()), - reduce_type, - root, - nccl_comm_, - stream)); + NCCL_CHECK(common::dynload::ncclReduce(in_tensor.data(), + out_tensor->data(), + in_tensor.numel(), + ToNCCLDataType(in_tensor.type()), + reduce_type, + root, + nccl_comm_, + stream)); } void NCCLCommContext::GroupStart() { - NCCL_CHECK(phi::dynload::ncclGroupStart()); + NCCL_CHECK(common::dynload::ncclGroupStart()); +} +void NCCLCommContext::GroupEnd() { + NCCL_CHECK(common::dynload::ncclGroupEnd()); } -void NCCLCommContext::GroupEnd() { NCCL_CHECK(phi::dynload::ncclGroupEnd()); } #if NCCL_VERSION_CODE >= 21100 void NCCLCommContext::RedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, ncclDataType_t dtype, 
ncclScalarResidence_t residence) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::ncclRedOpCreatePreMulSum( op, scalar, dtype, residence, nccl_comm_)); } void NCCLCommContext::RedOpDestroy(ncclRedOp_t op) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, nccl_comm_)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::ncclRedOpDestroy(op, nccl_comm_)); } #endif diff --git a/paddle/phi/core/distributed/nccl_comm_task.cc b/paddle/phi/core/distributed/nccl_comm_task.cc index f82f39c1954a3d..a495d7ec87621d 100644 --- a/paddle/phi/core/distributed/nccl_comm_task.cc +++ b/paddle/phi/core/distributed/nccl_comm_task.cc @@ -17,10 +17,10 @@ #include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/distributed/nccl_tools.h" #include "paddle/phi/core/distributed/trace_utils.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { namespace distributed { @@ -119,7 +119,7 @@ std::string GetNCCLErrorDetail(ncclResult_t result) { std::string last_error; #ifdef ENABLE_NCCL_GET_LAST_ERROR last_error = - ", Last error: " + std::string(phi::dynload::ncclGetLastError(NULL)); + ", Last error: " + std::string(common::dynload::ncclGetLastError(NULL)); #endif switch (result) { case ncclUnhandledCudaError: @@ -167,7 +167,7 @@ std::string NCCLCommTask::GetCommErrors() { ncclResult_t nccl_async_error; NCCL_CHECK( - phi::dynload::ncclCommGetAsyncError(nccl_comm_, &nccl_async_error)); + common::dynload::ncclCommGetAsyncError(nccl_comm_, &nccl_async_error)); if (nccl_async_error != ncclSuccess) { comm_error_ = "\n\t Find nccl comm error: " + GetNCCLErrorDetail(nccl_async_error); @@ -190,7 +190,7 @@ void NCCLCommTask::AbortComm() { if (aborted_) { return; } - NCCL_CHECK(phi::dynload::ncclCommAbort(nccl_comm_)); + NCCL_CHECK(common::dynload::ncclCommAbort(nccl_comm_)); aborted_ = true; nccl_comm_ = nullptr; diff --git a/paddle/phi/core/distributed/nccl_tools.cc b/paddle/phi/core/distributed/nccl_tools.cc index e419cfca905fa5..fa224684f119b9 100644 --- a/paddle/phi/core/distributed/nccl_tools.cc +++ b/paddle/phi/core/distributed/nccl_tools.cc @@ -16,8 +16,8 @@ #include -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #if NCCL_VERSION_CODE >= 21300 #define ENABLE_NCCL_GET_LAST_ERROR @@ -37,7 +37,7 @@ ncclRedOp_t ToNCCLRedType(ReduceOp reduction) { auto it = red_type.find(reduction); PADDLE_ENFORCE_EQ(it != red_type.end(), true, - phi::errors::InvalidArgument( + common::errors::InvalidArgument( "Invalid nccl reduction. 
Must be ncclMin | ncclMax | " "ncclProd | ncclSum")); return it->second; @@ -75,7 +75,7 @@ std::string NCCLDTypeToString(ncclDataType_t dtype) { PD_NCCL_DTYPE_TO_STR(ncclUint64, "uint64"); #undef PD_NCCL_DTYPE_TO_STR - PADDLE_THROW(phi::errors::InvalidArgument( + PADDLE_THROW(common::errors::InvalidArgument( "This datatype %d in nccl is not supported.", static_cast(dtype))); } diff --git a/paddle/phi/core/distributed/nccl_tools.h b/paddle/phi/core/distributed/nccl_tools.h index 4268e690e7382d..8f7cdcce9dd434 100644 --- a/paddle/phi/core/distributed/nccl_tools.h +++ b/paddle/phi/core/distributed/nccl_tools.h @@ -29,16 +29,16 @@ namespace phi { namespace distributed { -#define NCCL_CHECK(cmd) \ - do { \ - ncclResult_t r = cmd; \ - if (r != ncclSuccess) { \ - PADDLE_THROW( \ - phi::errors::External("Failed, NCCL error %s:%d '%s'\n", \ - __FILE__, \ - __LINE__, \ - phi::dynload::ncclGetErrorString(r))); \ - } \ +#define NCCL_CHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + PADDLE_THROW( \ + phi::errors::External("Failed, NCCL error %s:%d '%s'\n", \ + __FILE__, \ + __LINE__, \ + common::dynload::ncclGetErrorString(r))); \ + } \ } while (0) #ifdef PADDLE_WITH_NCCL diff --git a/paddle/phi/core/distributed/store/store.cc b/paddle/phi/core/distributed/store/store.cc index 5987b694b4e51e..8d4e6c26bb2b21 100644 --- a/paddle/phi/core/distributed/store/store.cc +++ b/paddle/phi/core/distributed/store/store.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/core/distributed/store/store.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/distributed/store/tcp_utils.h b/paddle/phi/core/distributed/store/tcp_utils.h index af11ad27f04254..29130949b4b7ac 100644 --- a/paddle/phi/core/distributed/store/tcp_utils.h +++ b/paddle/phi/core/distributed/store/tcp_utils.h @@ -31,7 +31,7 @@ #include #include -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" // Utility functions for TCP socket. namespace phi { diff --git a/paddle/phi/core/distributed/xccl_comm_context.cc b/paddle/phi/core/distributed/xccl_comm_context.cc index 5c82e7baf0e82f..154dddeae31db3 100644 --- a/paddle/phi/core/distributed/xccl_comm_context.cc +++ b/paddle/phi/core/distributed/xccl_comm_context.cc @@ -16,11 +16,11 @@ #include "glog/logging.h" +#include "paddle/common/data_type.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/distributed/check/static_check.h" #include "paddle/phi/core/distributed/utils.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { namespace distributed { diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h deleted file mode 100644 index 6106e56ae8dc95..00000000000000 --- a/paddle/phi/core/enforce.h +++ /dev/null @@ -1,1036 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef __GNUC__ -#include <cxxabi.h> // for __cxa_demangle -#endif // __GNUC__ - -#if !defined(_WIN32) -#include <dlfcn.h> // dladdr -#include <unistd.h> // sleep, usleep -#else // _WIN32 -#ifndef NOMINMAX -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#endif -#include <windows.h> // GetModuleFileName, Sleep -#endif - -#ifdef PADDLE_WITH_CUDA -#include -#include -#include -#include -#include -#include -#include -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include -#include -#include -#include -#include // NOLINT -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/common/macros.h" -#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) -#include -#endif - -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "paddle/common/errors.h" - -#include "paddle/utils/string/printf.h" -#include "paddle/utils/string/to_string.h" -#include "paddle/utils/test_macros.h" - -#ifdef PADDLE_WITH_CUDA -#include "paddle/common/backends/dynload/cublas.h" -#include "paddle/common/backends/dynload/cudnn.h" -#include "paddle/common/backends/dynload/curand.h" -#include "paddle/common/backends/dynload/cusolver.h" -#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -#include - -#include "paddle/common/backends/dynload/nccl.h" -#endif // __APPLE__ -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include "paddle/common/backends/dynload/hipfft.h" -#include "paddle/common/backends/dynload/hiprand.h" -#include "paddle/common/backends/dynload/miopen.h" -#include "paddle/common/backends/dynload/rocblas.h" -#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -#include // NOLINT - -#include "paddle/common/backends/dynload/rccl.h" -#endif // __APPLE__ -#endif // PADDLE_WITH_HIP - -// Note: these headers for simplify demangle type string -#include "paddle/common/type_defs.h" -// Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/common/backends/gpu/gpu_types.h" -#endif - -#include "paddle/utils/variant.h" - -namespace phi { -class ErrorSummary; -} // namespace phi - -namespace phi { -namespace proto {} // namespace proto -} // namespace phi - -namespace phi { -namespace enforce { - -/** HELPER MACROS AND FUNCTIONS **/ -#ifndef PADDLE_MAY_THROW -#define PADDLE_MAY_THROW noexcept(false) -#endif - -// Because most enforce conditions would evaluate to true, we can use -// __builtin_expect to instruct the C++ compiler to generate code that -// always forces branch prediction of true. -// This generates faster binary code. __builtin_expect is since C++11. -// For more details, please check https://stackoverflow.com/a/43870188/724872. -#if !defined(_WIN32) -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -#else -// there is no equivalent intrinsics in msvc.
-#define LIKELY(condition) (condition) -#endif - -#if defined _WIN32 && defined PADDLE_ON_INFERENCE && defined PADDLE_NO_PYTHON -#define HANDLE_THE_ERROR try { -#define END_HANDLE_THE_ERROR \ - } \ - catch (const std::exception& e) { \ - std::cout << e.what() << std::endl; \ - throw; \ - } -#else -#define HANDLE_THE_ERROR -#define END_HANDLE_THE_ERROR -#endif - -#ifdef __GNUC__ -inline std::string demangle(std::string name) { - int status = -4; // some arbitrary value to eliminate the compiler warning - std::unique_ptr res{ - abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free}; - return (status == 0) ? res.get() : name; -} -#else -inline std::string demangle(std::string name) { return name; } -#endif - -namespace details { -template -inline constexpr bool IsArithmetic() { - return std::is_arithmetic::value; -} - -template -struct TypeConverterImpl { - using Type1 = typename std::common_type::type; - using Type2 = Type1; -}; - -template -struct TypeConverterImpl { - using Type1 = T1; - using Type2 = T2; -}; - -template -struct TypeConverter { - static constexpr bool kIsArithmetic = - IsArithmetic() && IsArithmetic(); - using Type1 = typename TypeConverterImpl::Type1; - using Type2 = typename TypeConverterImpl::Type2; -}; - -template -using CommonType1 = typename std::add_lvalue_reference< - typename std::add_const::Type1>::type>::type; - -template -using CommonType2 = typename std::add_lvalue_reference< - typename std::add_const::Type2>::type>::type; - -// Here, we use SFINAE to check whether T can be converted to std::string -template -struct CanToString { - private: - using YesType = uint8_t; - using NoType = uint16_t; - - template - static YesType Check(decltype(std::cout << std::declval())) { - return 0; - } - - template - static NoType Check(...) 
{ - return 0; - } - - public: - static constexpr bool kValue = - std::is_same(std::cout))>::value; -}; - -template -struct BinaryCompareMessageConverter { - template - static std::string Convert(const char* expression, const T& value) { - return expression + std::string(":") + paddle::string::to_string(value); - } -}; - -template <> -struct BinaryCompareMessageConverter { - template - static const char* Convert(const char* expression, const T& value UNUSED) { - return expression; - } -}; -} // namespace details - -TEST_API int GetCallStackLevel(); -TEST_API std::string GetCurrentTraceBackString(bool for_signal = false); -TEST_API std::string SimplifyErrorTypeFormat(const std::string& str); - -template -static std::string GetErrorSumaryString(StrType&& what, - const char* file, - int line) { - std::ostringstream sout; - if (GetCallStackLevel() > 1) { - sout << "\n----------------------\nError Message " - "Summary:\n----------------------\n"; - } - sout << paddle::string::Sprintf( - "%s (at %s:%d)", std::forward(what), file, line) - << std::endl; - return sout.str(); -} - -template -std::string GetCompleteTraceBackString(StrType&& what, - const char* file, - int line) { - std::ostringstream sout; - sout << "\n----------------------\nError Message " - "Summary:\n----------------------\n"; - sout << paddle::string::Sprintf( - "%s (at %s:%d)", std::forward(what), file, line) - << std::endl; - return GetCurrentTraceBackString() + sout.str(); -} - -template -static std::string GetTraceBackString(StrType&& what, - const char* file, - int line) { - if (GetCallStackLevel() > 1) { - // FLAGS_call_stack_level>1 means showing c++ call stack - return GetCurrentTraceBackString() + GetErrorSumaryString(what, file, line); - } else { - return GetErrorSumaryString(what, file, line); - } -} - -inline bool is_error(bool stat) { return !stat; } - -// Note: This Macro can only be used within enforce.h -#define __THROW_ERROR_INTERNAL__(__ERROR_SUMMARY) \ - do { \ - HANDLE_THE_ERROR \ - throw ::phi::enforce::EnforceNotMet(__ERROR_SUMMARY, __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } while (0) - -/** - * [Why declare function ThrowWarnInternal instead of defining macro - * __THROW_WARN_INTERNAL__?] - * ThrowWarnInternal uses `LOG` macro to display warning message, which depends - * on third-party header file "logging.h". However, "logging.h" has not been - * exposed to site-package yet, so that error will occur when we include - * "enforce.h" header file. Hence, we declare function in enforce.h and define - * it in enforce.cc file. 
- */ -void ThrowWarnInternal(const std::string& message); - -/** ENFORCE EXCEPTION AND MACROS **/ - -struct EnforceNotMet : public std::exception { - public: - EnforceNotMet(std::exception_ptr e, const char* file, int line) { - try { - std::rethrow_exception(e); - } catch (EnforceNotMet& e) { - code_ = e.code(); - err_str_ = GetTraceBackString(e.what(), file, line); - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } catch (std::exception& e) { - err_str_ = GetTraceBackString(e.what(), file, line); - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } - } - - EnforceNotMet(const std::string& str, const char* file, int line) - : err_str_(GetTraceBackString(str, file, line)) { - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } - - EnforceNotMet(const common::ErrorSummary& error, const char* file, int line) - : code_(error.code()), - err_str_(GetTraceBackString(error.to_string(), file, line)) { - simple_err_str_ = SimplifyErrorTypeFormat(err_str_); - } - - const char* what() const noexcept override { - if (GetCallStackLevel() > 1) { - return err_str_.c_str(); - } else { - return simple_err_str_.c_str(); - } - } - - common::ErrorCode code() const { return code_; } - - const std::string& error_str() const { return err_str_; } - - const std::string& simple_error_str() const { return simple_err_str_; } - - void set_error_str(std::string str) { - if (GetCallStackLevel() > 1) { - err_str_ = str; - } else { - simple_err_str_ = str; - } - } - - ~EnforceNotMet() override = default; - - private: - // Used to determine the final type of exception thrown - common::ErrorCode code_ = common::ErrorCode::LEGACY; - // Complete error message - // e.g. InvalidArgumentError: *** - std::string err_str_; - // Simple error message used when no C++ stack and python compile stack - // e.g. (InvalidArgument) *** - std::string simple_err_str_; -}; - -#define PADDLE_THROW(...) \ - do { \ - HANDLE_THE_ERROR \ - throw ::phi::enforce::EnforceNotMet( \ - ::common::ErrorSummary(__VA_ARGS__), __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } while (0) - -#if defined(__CUDA_ARCH__) -// For cuda, the assertions can affect performance and it is therefore -// recommended to disable them in production code -// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion -#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ - do { \ - if (!(_IS_NOT_ERROR)) { \ - printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", \ - __FILE__, \ - __LINE__, \ - #_IS_NOT_ERROR, \ - ##__VA_ARGS__); \ - asm("trap;"); \ - } \ - } while (0) -#elif defined(__HIPCC__) -#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ - do { \ - if (!(_IS_NOT_ERROR)) { \ - printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", \ - __FILE__, \ - __LINE__, \ - #_IS_NOT_ERROR, \ - ##__VA_ARGS__); \ - abort(); \ - } \ - } while (0) -#else -#define PADDLE_ENFORCE(COND, ...) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(::phi::is_error(__cond__))) { \ - __THROW_ERROR_INTERNAL__(common::ErrorSummary(__VA_ARGS__)); \ - } \ - } while (0) -#endif - -/* - * Some enforce helpers here, usage: - * int a = 1; - * int b = 2; - * PADDLE_ENFORCE_EQ(a, b); - * - * will raise an expression described as follows: - * "Expected input a == b, but received a(1) != b(2)." - * with detailed stack information. - * - * extra messages is also supported, for example: - * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) - */ - -#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)
\ - do { \ - if (UNLIKELY(nullptr == (__VAL))) { \ - auto __summary__ = common::ErrorSummary(__VA_ARGS__); \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: " #__VAL " should not be null.]", \ - __summary__.error_message()); \ - __THROW_ERROR_INTERNAL__( \ - common::ErrorSummary(__summary__.code(), std::move(__message__))); \ - } \ - } while (0) - -#define PADDLE_WARN_NOT_NULL(__VAL, ...) \ - do { \ - if (UNLIKELY(nullptr == (__VAL))) { \ - auto __summary__ = common::ErrorSummary(__VA_ARGS__); \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: " #__VAL " should not be null.]", \ - __summary__.error_message()); \ - ::phi::enforce::ThrowWarnInternal(std::move(__message__)); \ - } \ - } while (0) - -#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) \ - do { \ - auto __val1 = (__VAL1); \ - auto __val2 = (__VAL2); \ - using __TYPE1__ = decltype(__val1); \ - using __TYPE2__ = decltype(__val2); \ - using __COMMON_TYPE1__ = \ - ::phi::details::CommonType1<__TYPE1__, __TYPE2__>; \ - using __COMMON_TYPE2__ = \ - ::phi::details::CommonType2<__TYPE1__, __TYPE2__>; \ - bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \ - static_cast<__COMMON_TYPE2__>(__val2)); \ - if (UNLIKELY(!__is_not_error)) { \ - auto __summary__ = common::ErrorSummary(__VA_ARGS__); \ - constexpr bool __kCanToString__ = \ - ::phi::details::CanToString<__TYPE1__>::kValue && \ - ::phi::details::CanToString<__TYPE2__>::kValue; \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: Expected %s " #__CMP \ - " %s, but received %s " #__INV_CMP " %s.]", \ - __summary__.error_message(), \ - #__VAL1, \ - #__VAL2, \ - ::phi::details::BinaryCompareMessageConverter< \ - __kCanToString__>::Convert(#__VAL1, __val1), \ - ::phi::details::BinaryCompareMessageConverter< \ - __kCanToString__>::Convert(#__VAL2, __val2)); \ - __THROW_ERROR_INTERNAL__( \ - common::ErrorSummary(__summary__.code(), std::move(__message__))); \ - } \ - } while (0) - -#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) -#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) -#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) -#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) -#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) -#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) - -/** EXTENDED TOOL FUNCTIONS WITH CHECKING **/ - -/* - * Summary: This macro is used to get Variable or internal type - * data (such as LoDTensor or SelectedRows) of the Input and - * Output in op, generally used when call scope.FindVar(Input/ - * Output("Name")) or ctx.Input(). - * Firstly this macro check whether the obtained pointer is null, - * and then return data if it is not null. - * - * Note: This macro is only suitable for specific scenarios and - * does not intended to be widely used. If it cannot meet the - * requirements, please use other PADDLE_ENFORCE** check macro. - * - * Parameters: - * __PTR: pointer - * __ROLE: (string), Input or Output - * __NAME: (string), Input or Output name - * __OP_TYPE: (string), the op type - * - * Return: The data pointed to by the pointer.
- * - * Examples: - * GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", "Mul"); - */ -#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \ - (([&]() -> std::add_lvalue_reference::type { \ - auto* __ptr = (__PTR); \ - if (UNLIKELY(nullptr == __ptr)) { \ - auto __summary__ = common::errors::NotFound( \ - "Unable to get %s data of %s %s in operator %s. " \ - "Possible reasons are:\n" \ - " 1. The %s is not the %s of operator %s;\n" \ - " 2. The %s has no corresponding variable passed in;\n" \ - " 3. The %s corresponding variable is not initialized.", \ - phi::demangle( \ - typeid(std::add_lvalue_reference::type) \ - .name()), \ - __ROLE, \ - __NAME, \ - __OP_TYPE, \ - __NAME, \ - __ROLE, \ - __OP_TYPE, \ - __NAME, \ - __NAME); \ - auto __message__ = ::paddle::string::Sprintf( \ - "%s\n [Hint: pointer " #__PTR " should not be null.]", \ - __summary__.error_message()); \ - __THROW_ERROR_INTERNAL__( \ - common::ErrorSummary(__summary__.code(), __message__)); \ - } \ - return *__ptr; \ - })()) - -/* - * Summary: This PADDLE_GET(_**) series macros are used to call paddle::get - * safely. paddle::get is not a completely safe api, although it will not - * go wrong in most cases, but in extreme cases, it may fail and directly - * throw a paddle::bad_variant_access const exception, without any stack - *information. - * This kind of problems is difficult to debug, so add these macros to - * enrich paddle::get error information. At the same time, we restrict - * the direct use of paddle::get by CI rule. - * - * Parameters: - * __TYPE: the target variable type - * __VALUE: the target variable to get - * - * Examples: - * - unsafe writing: int x = paddle::get(y); - * - safe writing: int x = PADDLE_GET(int, y); - * - * Note: GCC 4.8 cannot select right overloaded function here, so need - * to define different functions and macros here, after we upgrade - * CI gcc version, we can only define one PADDLE_GET macro.
- */ -namespace details { - -#define DEFINE_SAFE_PADDLE_GET( \ - __InputType, __OutputType, __OutputTypePtr, __FuncName) \ - template \ - auto __FuncName( \ - __InputType input, const char* expression, const char* file, int line) \ - ->typename std::conditional::value, \ - __OutputTypePtr, \ - __OutputType>::type { \ - try { \ - return paddle::get(input); \ - } catch (paddle::bad_variant_access const&) { \ - HANDLE_THE_ERROR \ - throw ::phi::enforce::EnforceNotMet( \ - common::errors::InvalidArgument( \ - "paddle::get failed, cannot get value " \ - "(%s) by type %s, its type is %s.", \ - expression, \ - phi::enforce::demangle(typeid(OutputType).name()), \ - phi::enforce::demangle(input.type().name())), \ - file, \ - line); \ - END_HANDLE_THE_ERROR \ - } \ - } - -DEFINE_SAFE_PADDLE_GET(InputType&, OutputType&, OutputType*, SafeBoostGet); -DEFINE_SAFE_PADDLE_GET(const InputType&, - const OutputType&, - const OutputType*, - SafeBoostGetConst); -DEFINE_SAFE_PADDLE_GET(InputType&&, - OutputType, - OutputType*, - SafeBoostGetMutable); - -} // namespace details - -#define PADDLE_GET(__TYPE, __VALUE) \ - phi::enforce::details::SafeBoostGet<__TYPE>( \ - __VALUE, #__VALUE, __FILE__, __LINE__) -#define PADDLE_GET_CONST(__TYPE, __VALUE) \ - phi::enforce::details::SafeBoostGetConst<__TYPE>( \ - __VALUE, #__VALUE, __FILE__, __LINE__) -#define PADDLE_GET_MUTABLE(__TYPE, __VALUE) \ - phi::enforce::details::SafeBoostGetMutable<__TYPE>( \ - __VALUE, #__VALUE, __FILE__, __LINE__) - -/**************************************************************************/ -/**************************** NVIDIA ERROR ********************************/ -#ifdef PADDLE_WITH_CUDA - -namespace details { - -template -struct ExternalApiType {}; - -#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess); -DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(cusparseStatus_t, CUSPARSE_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS); - -#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); -#endif - -} // namespace details - -template -std::string GetExternalErrorMsg(T status); - -/*************** CUDA ERROR ***************/ -inline bool is_error(cudaError_t e) { return e != cudaSuccess; } - -inline std::string build_nvidia_error_msg(cudaError_t e) { - std::ostringstream sout; - sout << "CUDA error(" << e << "), " << cudaGetErrorString(e) << ". " - << GetExternalErrorMsg(e); - return sout.str(); -} - -/*************** CURAND ERROR ***************/ -inline bool is_error(curandStatus_t stat) { - return stat != CURAND_STATUS_SUCCESS; -} - -inline std::string build_nvidia_error_msg(curandStatus_t stat) { - std::ostringstream sout; - sout << "CURAND error(" << stat << "). 
" << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUDNN ERROR ***************/ -inline bool is_error(cudnnStatus_t stat) { - return stat != CUDNN_STATUS_SUCCESS; -} - -inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { - std::ostringstream sout; - sout << "CUDNN error(" << stat << "), " - << phi::dynload::cudnnGetErrorString(stat) << ". " - << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUBLAS ERROR ***************/ -inline bool is_error(cublasStatus_t stat) { - return stat != CUBLAS_STATUS_SUCCESS; -} - -inline std::string build_nvidia_error_msg(cublasStatus_t stat) { - std::ostringstream sout; - sout << "CUBLAS error(" << stat << "). " << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUSPARSE ERROR ***************/ -inline bool is_error(cusparseStatus_t stat) { - return stat != CUSPARSE_STATUS_SUCCESS; -} - -inline std::string build_nvidia_error_msg(cusparseStatus_t stat) { - std::ostringstream sout; - sout << "CUSparse error(" << stat << "). " << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUSOLVER ERROR ***************/ -inline bool is_error(cusolverStatus_t stat) { - return stat != CUSOLVER_STATUS_SUCCESS; -} - -inline std::string build_nvidia_error_msg(cusolverStatus_t stat) { - std::ostringstream sout; - sout << "CUSOLVER error(" << stat << "). " << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUFFT ERROR ***************/ -inline bool is_error(cufftResult_t stat) { return stat != CUFFT_SUCCESS; } - -inline std::string build_nvidia_error_msg(cufftResult_t stat) { - std::ostringstream sout; - sout << "CUFFT error(" << stat << "). " << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUresult ERROR ***************/ -inline bool is_error(CUresult stat) { return stat != CUDA_SUCCESS; } - -inline std::string build_nvidia_error_msg(CUresult stat) { - std::ostringstream sout; - sout << "CU error(" << stat << "). " << GetExternalErrorMsg(stat); - return sout.str(); -} - -/**************** NCCL ERROR ****************/ -#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -inline bool is_error(ncclResult_t nccl_result) { - return nccl_result != ncclSuccess; -} - -inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { - std::ostringstream sout; - sout << "NCCL error(" << nccl_result << "), " - << phi::dynload::ncclGetErrorString(nccl_result) << ". "; - if (errno == ENOSPC || errno == EAGAIN) { - std::string detail(strerror(errno)); - detail += "\nPlease try one of the following solutions:"; - detail += "\n1. export NCCL_SHM_DISABLE=1;"; - detail += "\n2. export NCCL_P2P_LEVEL=SYS;"; - detail += - "\n3. 
Increase shared memory by setting the -shm-size " - "option when starting docker container, e.g., setting " - " -shm-size=2g.\n"; - sout << " Detail: " + detail; - } - sout << GetExternalErrorMsg(nccl_result); - return sout.str(); -} -#endif // not(__APPLE__) and PADDLE_WITH_NCCL - -#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = common::errors::External( \ - ::phi::enforce::build_nvidia_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#define PADDLE_WARN_GPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - ::phi::enforce::ThrowWarnInternal( \ - ::phi::enforce::build_nvidia_error_msg(__cond__)); \ - } \ - } while (0) - -#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP) \ - do { \ - auto res = cudaGetLastError(); \ - if (UNLIKELY(res != cudaSuccess)) { \ - auto msg = ::phi::enforce::build_nvidia_error_msg(res); \ - PADDLE_THROW( \ - common::errors::Fatal("CUDA error after kernel (%s): %s", OP, msg)); \ - } \ - } while (0) - -inline void retry_sleep(unsigned milliseconds) { -#ifdef _WIN32 - Sleep(milliseconds); -#else - if (milliseconds < 1000) { - // usleep argument must be less than 1,000,000. Reference: - // https://pubs.opengroup.org/onlinepubs/7908799/xsh/usleep.html - usleep(milliseconds * 1000); - } else { - // clip to sleep in seconds because we can not and don't have to - // sleep for exact milliseconds - sleep(milliseconds / 1000); - } -#endif -} - -#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - int retry_count = 1; \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - phi::enforce::retry_sleep(10000); \ - __cond__ = (COND); \ - ++retry_count; \ - } \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = common::errors::External( \ - ::phi::enforce::build_nvidia_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#undef DEFINE_EXTERNAL_API_TYPE -#endif // PADDLE_WITH_CUDA - -/**************************************************************************/ -/***************************** HIP ERROR **********************************/ -#ifdef PADDLE_WITH_HIP - -/***** HIP ERROR *****/ -inline bool is_error(hipError_t e) { return e != hipSuccess; } - -inline std::string build_rocm_error_msg(hipError_t e) { - std::ostringstream sout; - sout << " Hip error(" << e << "), " << hipGetErrorString(e) << "."; - return sout.str(); -} - -/***** HIPRAND ERROR *****/ -inline bool is_error(hiprandStatus_t stat) { - return stat != HIPRAND_STATUS_SUCCESS; -} - -inline const char* hiprandGetErrorString(hiprandStatus_t stat) { - switch (stat) { - case HIPRAND_STATUS_SUCCESS: - return "HIPRAND_STATUS_SUCCESS"; - case HIPRAND_STATUS_VERSION_MISMATCH: - return "HIPRAND_STATUS_VERSION_MISMATCH"; - case HIPRAND_STATUS_NOT_INITIALIZED: - return "HIPRAND_STATUS_NOT_INITIALIZED"; - case 
HIPRAND_STATUS_ALLOCATION_FAILED: - return "HIPRAND_STATUS_ALLOCATION_FAILED"; - case HIPRAND_STATUS_TYPE_ERROR: - return "HIPRAND_STATUS_TYPE_ERROR"; - case HIPRAND_STATUS_OUT_OF_RANGE: - return "HIPRAND_STATUS_OUT_OF_RANGE"; - case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE: - return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE"; - case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case HIPRAND_STATUS_LAUNCH_FAILURE: - return "HIPRAND_STATUS_LAUNCH_FAILURE"; - case HIPRAND_STATUS_PREEXISTING_FAILURE: - return "HIPRAND_STATUS_PREEXISTING_FAILURE"; - case HIPRAND_STATUS_INITIALIZATION_FAILED: - return "HIPRAND_STATUS_INITIALIZATION_FAILED"; - case HIPRAND_STATUS_ARCH_MISMATCH: - return "HIPRAND_STATUS_ARCH_MISMATCH"; - case HIPRAND_STATUS_INTERNAL_ERROR: - return "HIPRAND_STATUS_INTERNAL_ERROR"; - case HIPRAND_STATUS_NOT_IMPLEMENTED: - return "HIPRAND_STATUS_NOT_IMPLEMENTED"; - default: - return "Unknown hiprand status"; - } -} - -inline std::string build_rocm_error_msg(hiprandStatus_t stat) { - std::string msg(" Hiprand error, "); - return msg + hiprandGetErrorString(stat) + " "; -} - -/***** MIOPEN ERROR *****/ -inline bool is_error(miopenStatus_t stat) { - return stat != miopenStatusSuccess; -} - -inline std::string build_rocm_error_msg(miopenStatus_t stat) { - std::string msg(" Miopen error, "); - return msg + phi::dynload::miopenGetErrorString(stat) + " "; -} - -/***** ROCBLAS ERROR *****/ -inline bool is_error(rocblas_status stat) { - return stat != rocblas_status_success; -} - -inline const char* rocblasGetErrorString(rocblas_status stat) { - switch (stat) { - case rocblas_status_invalid_handle: - return "rocblas_status_invalid_handle"; - case rocblas_status_memory_error: - return "rocblas_status_memory_error"; - case rocblas_status_invalid_value: - return "rocblas_status_invalid_value"; - case rocblas_status_not_implemented: - return "rocblas_status_not_implemented"; - case rocblas_status_invalid_pointer: - return "rocblas_status_invalid_pointer"; - case rocblas_status_invalid_size: - return "rocblas_status_invalid_size"; - case rocblas_status_internal_error: - return "rocblas_status_internal_error"; - default: - return "Unknown cublas status"; - } -} - -inline std::string build_rocm_error_msg(rocblas_status stat) { - std::string msg(" Rocblas error, "); - return msg + rocblasGetErrorString(stat) + " "; -} - -/****** RCCL ERROR ******/ -#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -inline bool is_error(ncclResult_t nccl_result) { - return nccl_result != ncclSuccess; -} - -inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { - std::string msg(" Rccl error, "); - return msg + phi::dynload::ncclGetErrorString(nccl_result) + " "; -} -#endif // not(__APPLE__) and PADDLE_WITH_NCCL - -/***** HIPFFT ERROR *****/ -inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; } - -inline std::string build_rocm_error_msg(hipfftResult_t stat) { - std::string msg(" HIPFFT error, "); - return msg + phi::dynload::hipfftGetErrorString(stat) + " "; -} - -namespace details { - -template -struct ExternalApiType {}; - -#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess); -DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess); -DEFINE_EXTERNAL_API_TYPE(rocblas_status, 
rocblas_status_success); -DEFINE_EXTERNAL_API_TYPE(hipfftResult_t, HIPFFT_SUCCESS); - -#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); -#endif - -} // namespace details - -#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = common::errors::External( \ - ::phi::enforce::build_rocm_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#define PADDLE_WARN_GPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - ::phi::enforce::ThrowWarnInternal( \ - ::phi::enforce::build_rocm_error_msg(__cond__)); \ - } \ - } while (0) - -inline void retry_sleep(unsigned millisecond) { -#ifdef _WIN32 - Sleep(millisecond); -#else - sleep(millisecond); -#endif -} - -#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - int retry_count = 1; \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - ::phi::enforce::retry_sleep(10000); \ - __cond__ = (COND); \ - ++retry_count; \ - } \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = common::errors::External( \ - ::phi::enforce::build_rocm_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#undef DEFINE_EXTERNAL_API_TYPE -#endif // PADDLE_WITH_HIP - -} // namespace enforce -using namespace enforce; // NOLINT -} // namespace phi diff --git a/paddle/phi/core/errors.cc b/paddle/phi/core/errors.cc index 0fcf8f292c1e17..d76802b1d023fa 100644 --- a/paddle/phi/core/errors.cc +++ b/paddle/phi/core/errors.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include -namespace phi { +namespace common { std::string error_name(ErrorCode code) { switch (code) { case ErrorCode::LEGACY: @@ -70,4 +70,4 @@ std::string ErrorSummary::to_string() const { result += error_message(); return result; } -} // namespace phi +} // namespace common diff --git a/paddle/phi/core/errors.h b/paddle/phi/core/errors.h deleted file mode 100644 index 5d1143b2e76a70..00000000000000 --- a/paddle/phi/core/errors.h +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/utils/string/printf.h" -#include "paddle/utils/test_macros.h" - -namespace phi { -enum ErrorCode { - // Legacy error. - // Error type string: "Error" - LEGACY = 0, - - // Client specified an invalid argument. - // Error type string: "InvalidArgumentError" - INVALID_ARGUMENT = 1, - - // Some requested entity (e.g., file or directory) was not found. - // Error type string: "NotFoundError" - NOT_FOUND = 2, - - // Operation tried to iterate past the valid input range. E.g., seeking or - // reading past end of file. - // Error type string: "OutOfRangeError" - OUT_OF_RANGE = 3, - - // Some entity that we attempted to create (e.g., file or directory) - // already exists. - // Error type string: "AlreadyExistsError" - ALREADY_EXISTS = 4, - - // Some resource has been exhausted, perhaps a per-user quota, or - // perhaps the entire file system is out of space. - // Error type string: "ResourceExhaustedError" - RESOURCE_EXHAUSTED = 5, - - // Operation was rejected because the system is not in a state - // required for the operation's execution. - // Error type string: "PreconditionNotMetError" - PRECONDITION_NOT_MET = 6, - - // The caller does not have permission to execute the specified - // operation. - // Error type string: "PermissionDeniedError" - PERMISSION_DENIED = 7, - - // Deadline expired before operation could complete. - // Error type string: "ExecutionTimeout" - EXECUTION_TIMEOUT = 8, - - // Operation is not implemented or not supported/enabled in this service. - // Error type string: "UnimplementedError" - UNIMPLEMENTED = 9, - - // The service is currently unavailable. This is a most likely a - // transient condition and may be corrected by retrying with - // a backoff. - // Error type string: "UnavailableError" - UNAVAILABLE = 10, - - // Fatal errors. Means some invariant expected by the underlying - // system has been broken. If you see one of these errors, - // something is very broken. - // Error type string: "FatalError" - FATAL = 11, - - // Third-party library error. - // Error type string: "ExternalError" - EXTERNAL = 12, -}; - -class ErrorSummary { - public: - // Note(chenweihang): Final deprecated constructor - // This constructor is used to be compatible with - // current existing untyped PADDLE_ENFORCE_* - // PADDLE_ENFORCE - // Note(chenweihang): Windows openblas need this - // constructor for compiling PADDLE_ENFORCE in *.cu, - // this is a bug cause we can't remove this - // constructor now. - template - explicit ErrorSummary(Args... args) { - code_ = common::ErrorCode::LEGACY; - msg_ = paddle::string::Sprintf(args...); - } - - // Note(chenweihang): Only recommended constructor - // No longer supports PADDLE_ENFORCE without type or without error message - explicit ErrorSummary(ErrorCode code, std::string msg) - : code_(code), msg_(msg) {} - - ErrorCode code() const { return code_; } - - const std::string& error_message() const { return msg_; } - - TEST_API std::string to_string() const; - - private: - ErrorCode code_; - std::string msg_; -}; - -namespace errors { - -#define REGISTER_ERROR(FUNC, CONST, ...) \ - template \ - common::ErrorSummary FUNC(Args...
args) { \ - return common::ErrorSummary(common::CONST, \ - ::paddle::string::Sprintf(args...)); \ - } - -REGISTER_ERROR(InvalidArgument, ErrorCode::INVALID_ARGUMENT) -REGISTER_ERROR(NotFound, ErrorCode::NOT_FOUND) -REGISTER_ERROR(OutOfRange, ErrorCode::OUT_OF_RANGE) -REGISTER_ERROR(AlreadyExists, ErrorCode::ALREADY_EXISTS) -REGISTER_ERROR(ResourceExhausted, ErrorCode::RESOURCE_EXHAUSTED) -REGISTER_ERROR(PreconditionNotMet, ErrorCode::PRECONDITION_NOT_MET) -REGISTER_ERROR(PermissionDenied, ErrorCode::PERMISSION_DENIED) -REGISTER_ERROR(ExecutionTimeout, ErrorCode::EXECUTION_TIMEOUT) -REGISTER_ERROR(Unimplemented, ErrorCode::UNIMPLEMENTED) -REGISTER_ERROR(Unavailable, ErrorCode::UNAVAILABLE) -REGISTER_ERROR(Fatal, ErrorCode::FATAL) -REGISTER_ERROR(External, ErrorCode::EXTERNAL) - -#undef REGISTER_ERROR - -} // namespace errors -} // namespace phi diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index b3f8a2d19caba0..278024ee353dec 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -19,9 +19,9 @@ limitations under the License. */ #include #include +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/xpu/xpu_info.h" -#include "paddle/phi/core/enforce.h" static uint64_t GetRandomSeed() { std::random_device rd; diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 84b944bbe19c5c..ff4c2b5ee8a3e1 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -19,12 +19,12 @@ limitations under the License. */ #include #include +#include "paddle/common/enforce.h" #include "paddle/common/macros.h" #include "paddle/common/scalar.h" #include "paddle/common/type_defs.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/attribute.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/utils/any.h" #include "paddle/utils/flat_hash_map.h" diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index df7bfd31251c8e..097f003d7345b4 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -17,10 +17,10 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/common/type_defs.h" #include "paddle/phi/core/attribute.h" #include "paddle/phi/core/device_context.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/utils/optional.h" diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 69c7900def16ba..326aad1a0001bc 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -15,7 +15,7 @@ #include "paddle/phi/core/kernel_factory.h" #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/utils/flags.h" #if defined(PADDLE_WITH_XPU) #include "paddle/phi/backends/xpu/xpu_op_list.h" #endif diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index 7d62485d703fbd..486206023e2912 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -18,13 +18,13 @@ #include #include #include +#include "paddle/common/data_type.h" #include "paddle/common/type_defs.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/compat/convert_utils.h" #include
"paddle/phi/core/compat/get_kerneltype_forvar_utils.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" namespace phi { diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 88350b88e5011e..33bc151d7f8583 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -14,12 +14,12 @@ #pragma once +#include "paddle/common/enforce.h" #include "paddle/common/scalar.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/tensor_ref.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/extended_tensor.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/selected_rows.h" diff --git a/paddle/phi/core/lod_utils.cc b/paddle/phi/core/lod_utils.cc index 2ebf0f23116417..8b8bc5dabdc887 100644 --- a/paddle/phi/core/lod_utils.cc +++ b/paddle/phi/core/lod_utils.cc @@ -14,7 +14,7 @@ #include "paddle/phi/core/lod_utils.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 8f63dc5d4d56cf..40e94f06cbddee 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -16,10 +16,10 @@ limitations under the License. */ #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/fluid/pir/dialect/operator/ir/meta_tensor.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/core/string_tensor_utils.h" diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 4b0480af48cdad..38a1556fbd73c3 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -14,10 +14,10 @@ limitations under the License. */ #pragma once +#include "paddle/common/ddim.h" #include "paddle/common/macros.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/phi/core/mixed_vector.h b/paddle/phi/core/mixed_vector.h index d25a646608d3d2..f06ab5c6bb52d9 100644 --- a/paddle/phi/core/mixed_vector.h +++ b/paddle/phi/core/mixed_vector.h @@ -22,10 +22,10 @@ limitations under the License. */ #include #include "glog/logging.h" +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/utils/none.h" #include "paddle/utils/optional.h" diff --git a/paddle/phi/core/selected_rows_impl.cc b/paddle/phi/core/selected_rows_impl.cc index ff96342940d923..c8bc294c6b4622 100644 --- a/paddle/phi/core/selected_rows_impl.cc +++ b/paddle/phi/core/selected_rows_impl.cc @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "glog/logging.h" +#include "paddle/common/data_type.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { diff --git a/paddle/phi/core/selected_rows_impl.h b/paddle/phi/core/selected_rows_impl.h index a29f66b99420ab..445fb4cfaf51f7 100644 --- a/paddle/phi/core/selected_rows_impl.h +++ b/paddle/phi/core/selected_rows_impl.h @@ -21,10 +21,10 @@ limitations under the License. */ #include #include +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/utils/rw_lock.h" namespace phi { diff --git a/paddle/phi/core/storage_properties.h b/paddle/phi/core/storage_properties.h index a6a66305c6297e..9d662544d9404b 100644 --- a/paddle/phi/core/storage_properties.h +++ b/paddle/phi/core/storage_properties.h @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/utils/type_registry.h" #ifdef PADDLE_WITH_DNNL diff --git a/paddle/phi/core/tensor_base.h b/paddle/phi/core/tensor_base.h index 069382720e19de..334ef081e92787 100644 --- a/paddle/phi/core/tensor_base.h +++ b/paddle/phi/core/tensor_base.h @@ -14,12 +14,12 @@ limitations under the License. */ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/utils/type_registry.h" namespace phi { diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index c57573e59be37a..02211c055c5074 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -16,10 +16,10 @@ limitations under the License. 
*/ #include +#include "paddle/common/ddim.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" #include "paddle/utils/any.h" #include "paddle/utils/optional.h" #include "paddle/utils/test_macros.h" diff --git a/paddle/phi/core/threadpool.cc b/paddle/phi/core/threadpool.cc index 7538087f4e8553..3d4894bcadf40c 100644 --- a/paddle/phi/core/threadpool.cc +++ b/paddle/phi/core/threadpool.cc @@ -17,7 +17,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/utils/flags.h" PD_DECLARE_int32(dist_threadpool_size); diff --git a/paddle/phi/core/threadpool.h b/paddle/phi/core/threadpool.h index 318ec38d3c8c58..d05adec044f65f 100644 --- a/paddle/phi/core/threadpool.h +++ b/paddle/phi/core/threadpool.h @@ -24,8 +24,8 @@ #include #include +#include "paddle/common/enforce.h" #include "paddle/common/macros.h" // for DISABLE_COPY_AND_ASSIGN -#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/core/utils/array.h b/paddle/phi/core/utils/array.h index 20e120b990c02f..5d7726b8320db1 100644 --- a/paddle/phi/core/utils/array.h +++ b/paddle/phi/core/utils/array.h @@ -16,8 +16,8 @@ #include +#include "paddle/common/enforce.h" #include "paddle/common/unroll_array_ops.h" -#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/core/utils/intrusive_ptr.h b/paddle/phi/core/utils/intrusive_ptr.h index e2e6cb7060d057..aab4fe45a6cf2b 100644 --- a/paddle/phi/core/utils/intrusive_ptr.h +++ b/paddle/phi/core/utils/intrusive_ptr.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { diff --git a/paddle/phi/core/utils/rw_lock.h b/paddle/phi/core/utils/rw_lock.h index fa87cfcbb5feeb..62dfadf4edc1fd 100644 --- a/paddle/phi/core/utils/rw_lock.h +++ b/paddle/phi/core/utils/rw_lock.h @@ -20,7 +20,7 @@ limitations under the License. */ #include // NOLINT #endif // !_WIN32 -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { diff --git a/paddle/phi/core/utils/visit_place.h b/paddle/phi/core/utils/visit_place.h index 6318b17647cd61..aed8ee48024d71 100644 --- a/paddle/phi/core/utils/visit_place.h +++ b/paddle/phi/core/utils/visit_place.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/enforce.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 4c5e130aab7a07..f399ce6de11b2b 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/backward.h" +#include "paddle/common/data_type.h" #include "paddle/phi/common/type_traits.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/axis_utils.h" namespace phi { diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index add5013298d309..d917c8dea3d243 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -18,12 +18,12 @@ limitations under the License. 
*/ #include #include "glog/logging.h" +#include "paddle/common/data_type.h" +#include "paddle/common/ddim.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/type_traits.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/axis_utils.h" diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 311f054e9dd274..5aeae9435f6bf5 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -18,13 +18,13 @@ limitations under the License. */ #include "glog/logging.h" +#include "paddle/common/data_type.h" #include "paddle/common/scalar.h" #include "paddle/phi/backends/device_memory_aligment.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/infermeta/spmd_rules/dim_trans.cc b/paddle/phi/infermeta/spmd_rules/dim_trans.cc index d781cc415ae4c4..e4608ca67aa659 100644 --- a/paddle/phi/infermeta/spmd_rules/dim_trans.cc +++ b/paddle/phi/infermeta/spmd_rules/dim_trans.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include #include #include +#include "paddle/common/enforce.h" #include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace distributed { diff --git a/paddle/phi/infermeta/spmd_rules/utils.cc b/paddle/phi/infermeta/spmd_rules/utils.cc index 42bbc659b2f2be..ca7b2608f1e4e2 100644 --- a/paddle/phi/infermeta/spmd_rules/utils.cc +++ b/paddle/phi/infermeta/spmd_rules/utils.cc @@ -16,9 +16,9 @@ limitations under the License. */ #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" #include "paddle/phi/core/distributed/auto_parallel/utils.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace distributed { diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index d86b25b7ba224f..406b38dd0f3d1c 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "glog/logging.h" +#include "paddle/common/ddim.h" #include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/impl/box_coder.h" diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 0308093ed9fc67..5c9124430b8be4 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -17,12 +17,12 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/data_type.h" +#include "paddle/common/enforce.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/type_traits.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/parse_qr_mode.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/slice_utils.h" diff --git a/paddle/phi/kernels/autotune/cache_base.h b/paddle/phi/kernels/autotune/cache_base.h index 68463e900c3578..64450fc99c01d4 100644 --- a/paddle/phi/kernels/autotune/cache_base.h +++ b/paddle/phi/kernels/autotune/cache_base.h @@ -18,8 +18,8 @@ #include #include -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_int32(search_cache_max_number); diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 7433bb9e5ee22d..f31df8ebecb2cd 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -15,8 +15,8 @@ #pragma once #include "paddle/common/backends/gpu/gpu_decls.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #ifdef PADDLE_WITH_CUDA #include #endif diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc index fd6cf3aebc2687..e878a1d55314b2 100644 --- a/paddle/phi/kernels/cpu/allclose_kernel.cc +++ b/paddle/phi/kernels/cpu/allclose_kernel.cc @@ -17,7 +17,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc index ce00926101f2cc..c89701e955fa9a 100644 --- a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc @@ -14,10 +14,10 @@ #include "paddle/phi/kernels/arg_min_max_kernel.h" +#include "paddle/common/data_type.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/cpu/assign_pos_kernel.cc b/paddle/phi/kernels/cpu/assign_pos_kernel.cc index ceab18c5ecc7b4..7bad2262dad685 100644 --- a/paddle/phi/kernels/cpu/assign_pos_kernel.cc +++ b/paddle/phi/kernels/cpu/assign_pos_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. 
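The hunk at the top of this section deletes the REGISTER_ERROR block from paddle/phi/core/errors, whose expansions generate the InvalidArgument/NotFound/OutOfRange/... factory functions that the include swaps throughout this section retarget to paddle/common/errors.h. Below is a minimal, self-contained sketch of that factory-macro pattern; the ErrorSummary type and the Sprintf stand-in are simplified assumptions, not Paddle's exact definitions.

```cpp
#include <sstream>
#include <string>
#include <utility>

namespace demo {

enum class ErrorCode { INVALID_ARGUMENT, NOT_FOUND, OUT_OF_RANGE };

struct ErrorSummary {
  ErrorCode code;
  std::string message;
};

// Stand-in for paddle::string::Sprintf: stream all arguments into one string.
template <typename... Args>
std::string Sprintf(const Args&... args) {
  std::ostringstream os;
  (os << ... << args);  // C++17 fold expression
  return os.str();
}

// Each REGISTER_ERROR(FUNC, CONST) expansion stamps out one variadic factory.
#define REGISTER_ERROR(FUNC, CONST)                             \
  template <typename... Args>                                   \
  ErrorSummary FUNC(Args&&... args) {                           \
    return ErrorSummary{ErrorCode::CONST,                       \
                        Sprintf(std::forward<Args>(args)...)};  \
  }

REGISTER_ERROR(InvalidArgument, INVALID_ARGUMENT)
REGISTER_ERROR(NotFound, NOT_FOUND)
#undef REGISTER_ERROR

}  // namespace demo

int main() {
  auto err = demo::InvalidArgument("expected rank ", 2, ", got ", 3);
  return err.code == demo::ErrorCode::INVALID_ARGUMENT ? 0 : 1;
}
```

Because every factory is macro-generated with the same shape, relocating the whole family from phi::errors to common::errors is mechanical: only the namespace and header path change, not the call sites.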
#include "paddle/phi/kernels/assign_pos_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc index 168f4e159cb811..9ca3ea52ae9461 100644 --- a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -16,9 +16,9 @@ #include +#include "paddle/common/enforce.h" #include "paddle/common/float16.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h index 159a5cfbeb6b41..d9301ac8aae08e 100644 --- a/paddle/phi/kernels/cpu/conv_util.h +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/cudnn_lstm_kernel.cc b/paddle/phi/kernels/cpu/cudnn_lstm_kernel.cc index cd709fe2bf4656..797cf05a3f28a8 100644 --- a/paddle/phi/kernels/cpu/cudnn_lstm_kernel.cc +++ b/paddle/phi/kernels/cpu/cudnn_lstm_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/cudnn_lstm_kernel.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc index a2cc99c59fe2d8..01adfe7df0c0da 100644 --- a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc @@ -14,16 +14,16 @@ #include "paddle/phi/kernels/cumprod_grad_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/cumprod.h" #include "paddle/phi/kernels/funcs/for_range.h" // NOTE(@xiongkun): use of IsComplex<> -#include "paddle/phi/core/utils/data_type.h" +#include "paddle/common/data_type.h" namespace phi { template diff --git a/paddle/phi/kernels/cpu/eigvals_kernel.cc b/paddle/phi/kernels/cpu/eigvals_kernel.cc index 5380106fd020bc..01d636bfaf06d3 100644 --- a/paddle/phi/kernels/cpu/eigvals_kernel.cc +++ b/paddle/phi/kernels/cpu/eigvals_kernel.cc @@ -17,9 +17,9 @@ #include "glog/logging.h" #include "paddle/common/complex.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" diff --git a/paddle/phi/kernels/cpu/embedding_kernel.cc b/paddle/phi/kernels/cpu/embedding_kernel.cc index 0d937e6364eacc..0c92c68deb1087 100644 --- a/paddle/phi/kernels/cpu/embedding_kernel.cc +++ b/paddle/phi/kernels/cpu/embedding_kernel.cc @@ -14,10 +14,10 @@ #include "paddle/phi/kernels/embedding_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/cpu/cpu_context.h" 
#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/embedding_util.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/gather_tree_kernel.cc b/paddle/phi/kernels/cpu/gather_tree_kernel.cc index dac1441cb5006e..5df9058c2297e0 100644 --- a/paddle/phi/kernels/cpu/gather_tree_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_tree_kernel.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/gather_tree_kernel.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/index_add_grad_kernel.cc b/paddle/phi/kernels/cpu/index_add_grad_kernel.cc index a60d52f2005a4b..902bb9473c4bf7 100644 --- a/paddle/phi/kernels/cpu/index_add_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/index_add_grad_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/index_add_grad_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/cpu/index_select_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/index_add_kernel.cc b/paddle/phi/kernels/cpu/index_add_kernel.cc index c2c5aa60814c51..3bdc770371b4b4 100644 --- a/paddle/phi/kernels/cpu/index_add_kernel.cc +++ b/paddle/phi/kernels/cpu/index_add_kernel.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/index_add_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/cpu/index_add_impl.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc index c49a4531aea7a1..c87b2d72e67304 100644 --- a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc @@ -14,11 +14,11 @@ #include "paddle/phi/kernels/index_sample_grad_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { template void IndexSampleGradInner(const Context& context, diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc index 02f3afcb67b6ef..b8b697a3d451a4 100644 --- a/paddle/phi/kernels/cpu/index_sample_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -23,11 +23,11 @@ #include "glog/logging.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { template void IndexSampleInner(const Context &context, diff --git a/paddle/phi/kernels/cpu/index_select_grad_kernel.cc b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc index 4e53056bd117f7..451f4d14b82523 100644 --- a/paddle/phi/kernels/cpu/index_select_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/index_select_grad_kernel.h" +#include "paddle/common/data_type.h" #include 
"paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/cpu/index_select_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/index_select_kernel.cc b/paddle/phi/kernels/cpu/index_select_kernel.cc index a9e64d1d183b46..8d02554d8b108c 100644 --- a/paddle/phi/kernels/cpu/index_select_kernel.cc +++ b/paddle/phi/kernels/cpu/index_select_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/index_select_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/cpu/index_select_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/limit_by_capacity_kernel.cc b/paddle/phi/kernels/cpu/limit_by_capacity_kernel.cc index ea2f6cbc6ee82c..1057120b2ae5e1 100644 --- a/paddle/phi/kernels/cpu/limit_by_capacity_kernel.cc +++ b/paddle/phi/kernels/cpu/limit_by_capacity_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/limit_by_capacity_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" #if defined(PADDLE_WITH_GLOO) diff --git a/paddle/phi/kernels/cpu/matrix_nms_kernel.cc b/paddle/phi/kernels/cpu/matrix_nms_kernel.cc index b2827d039bacce..49983182d644d6 100644 --- a/paddle/phi/kernels/cpu/matrix_nms_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_nms_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/matrix_nms_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/nll_loss_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_kernel.cc index c966e91a9a6e96..66c6e9449b6103 100644 --- a/paddle/phi/kernels/cpu/nll_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/nll_loss_kernel.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/one_hot_kernel.cc b/paddle/phi/kernels/cpu/one_hot_kernel.cc index 0958e2c02b4c1c..a03eae4b12e303 100644 --- a/paddle/phi/kernels/cpu/one_hot_kernel.cc +++ b/paddle/phi/kernels/cpu/one_hot_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/one_hot_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/prune_gate_by_capacity_kernel.cc b/paddle/phi/kernels/cpu/prune_gate_by_capacity_kernel.cc index ed26b4f37dd5cc..7f2717b8ecacef 100644 --- a/paddle/phi/kernels/cpu/prune_gate_by_capacity_kernel.cc +++ b/paddle/phi/kernels/cpu/prune_gate_by_capacity_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/prune_gate_by_capacity_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/random_routing_kernel.cc b/paddle/phi/kernels/cpu/random_routing_kernel.cc index 0e1d450c1894ae..cdeab98f4c1ab3 100644 --- a/paddle/phi/kernels/cpu/random_routing_kernel.cc +++ b/paddle/phi/kernels/cpu/random_routing_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/random_routing_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc index 05f19ac36107ec..6a03a88b020d45 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc @@ -14,10 +14,10 @@ #include "paddle/phi/kernels/repeat_interleave_grad_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/cpu/index_select_impl.h" #include "paddle/phi/kernels/funcs/repeat_tensor2index_tensor.h" diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc index d296aba66503b7..df67b2e66b5dbc 100644 --- a/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/sparse_weight_embedding_grad_kernel.cc @@ -14,10 +14,10 @@ #include "paddle/phi/kernels/sparse_weight_embedding_grad_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/embedding_util.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc b/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc index 175b4a750a8203..081ac2b9d05bcf 100644 --- a/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc +++ b/paddle/phi/kernels/cpu/sparse_weight_embedding_kernel.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/common/data_type.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/embedding_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/embedding_util.h" diff --git a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc index 06c897b2199845..50ecc73d004c23 100644 --- a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc +++ b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/triangular_solve_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" diff --git a/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc index d0d674d06ee2bd..9cdbe84342d3c0 100644 --- a/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc +++ b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc @@ -17,10 +17,10 @@ #include "paddle/phi/kernels/cpu/unique_consecutive_functor.h" #include "paddle/phi/kernels/unique_consecutive_kernel.h" +#include "paddle/common/data_type.h" +#include "paddle/common/errors.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/unique_kernel.cc b/paddle/phi/kernels/cpu/unique_kernel.cc index 1ea8452e1d1fa5..cbf495e970ea98 100644 --- a/paddle/phi/kernels/cpu/unique_kernel.cc +++ b/paddle/phi/kernels/cpu/unique_kernel.cc @@ -16,9 +16,9 @@ #include "paddle/phi/kernels/unique_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/unique_functor.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 799bfa45c416b6..c6404d50466040 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -30,10 +30,10 @@ #include #include "paddle/common/bfloat16.h" +#include "paddle/common/enforce.h" #include "paddle/common/float16.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" diff --git a/paddle/phi/kernels/funcs/axis_utils.h b/paddle/phi/kernels/funcs/axis_utils.h index 368c4a9e14061c..41bbd4f048c6b4 100644 --- a/paddle/phi/kernels/funcs/axis_utils.h +++ b/paddle/phi/kernels/funcs/axis_utils.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index c15c7c25954bcb..59d8e6d9df2ebf 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -38,33 +38,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... 
args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasSgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSaxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasSaxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasSscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasScopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasScopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasSgemv(args...)); } template static void GEMM_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasSgemmBatched(args...)); #else PADDLE_THROW(phi::errors::Unimplemented( "SgemmBatched is not supported on cuda <= 7.5")); @@ -75,7 +75,7 @@ struct CUBlas { static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasSgemmStridedBatched(args...)); + common::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW(phi::errors::Unimplemented( "SgemmStridedBatched is not supported on cuda <= 7.5")); @@ -109,23 +109,23 @@ struct CUBlas { VLOG(5) << "use_tensor_op_math: " << (dev_ctx->tensor_core_available() ? "True" : "False"); dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasSgemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc)); }); #else PADDLE_THROW(phi::errors::Unimplemented( @@ -135,32 +135,32 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasStrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasStrsm(args...)); } template static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgetrfBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasSgetrfBatched(args...)); } template static void GETRI_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgetriBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasSgetriBatched(args...)); } template static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSmatinvBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasSmatinvBatched(args...)); } template static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgetrsBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasSgetrsBatched(args...)); } template static void TRSM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasStrsmBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasStrsmBatched(args...)); } }; @@ -168,33 +168,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... 
args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDaxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDaxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDcopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDcopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDgemv(args...)); } template static void GEMM_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgemmBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDgemmBatched(args...)); #else PADDLE_THROW(phi::errors::Unimplemented( "DgemmBatched is not supported on cuda <= 7.5")); @@ -205,7 +205,7 @@ struct CUBlas { static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasDgemmStridedBatched(args...)); + common::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW(phi::errors::Unimplemented( "DgemmStridedBatched is not supported on cuda <= 7.5")); @@ -220,32 +220,32 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDtrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDtrsm(args...)); } template static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgetrfBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDgetrfBatched(args...)); } template static void GETRI_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgetriBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDgetriBatched(args...)); } template static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDmatinvBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDmatinvBatched(args...)); } template static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgetrsBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDgetrsBatched(args...)); } template static void TRSM_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDtrsmBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasDtrsmBatched(args...)); } }; @@ -268,20 +268,20 @@ struct CUBlas { float16 *C, int ldc) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasHgemm(handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast<__half *>(C), - ldc)); + common::dynload::cublasHgemm(handle, + transa, + transb, + m, + n, + k, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + reinterpret_cast(B), + ldb, + reinterpret_cast(beta), + reinterpret_cast<__half *>(C), + ldc)); } #if defined(__NVCC__) @@ -319,26 +319,26 @@ struct CUBlas { thrust::device_vector C_ptr(C, C + batchCount); dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A_ptr.data().get(), - Atype, - lda, - B_ptr.data().get(), - Btype, - ldb, - beta, - C_ptr.data().get(), - Ctype, - ldc, - batchCount, - computeType, - algo)); + common::dynload::cublasGemmBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A_ptr.data().get(), + Atype, + lda, + B_ptr.data().get(), + Btype, + ldb, + beta, + C_ptr.data().get(), + Ctype, + ldc, + batchCount, + computeType, + algo)); }); #else PADDLE_THROW(phi::errors::Unimplemented( @@ -366,7 +366,7 @@ struct CUBlas { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasHgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasHgemmStridedBatched( handle, transa, transb, @@ -424,25 +424,25 @@ struct CUBlas { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); }); #else PADDLE_THROW(phi::errors::Unimplemented( @@ -465,7 +465,7 @@ struct CUBlas> { const phi::dtype::complex *beta, phi::dtype::complex *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgemv( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasCgemv( handle, transa, m, @@ -487,7 +487,7 @@ struct CUBlas> { const int incX, phi::dtype::complex *Y, const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasCaxpy( handle, n, reinterpret_cast(alpha), @@ -516,7 +516,7 @@ struct CUBlas> { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasCgemmStridedBatched( handle, transa, transb, @@ -555,7 +555,7 @@ struct CUBlas> { const phi::dtype::complex *beta, phi::dtype::complex *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgemm( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasCgemm( handle, transa, transb, @@ -584,7 +584,7 @@ struct CUBlas> { int lda, phi::dtype::complex *B, int ldb) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCtrsm( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasCtrsm( 
handle, side, uplo, @@ -632,25 +632,25 @@ struct CUBlas> { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); }); #else PADDLE_THROW(phi::errors::Unimplemented( @@ -671,7 +671,7 @@ struct CUBlas> { phi::dtype::complex **B, int ldb, int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCtrsmBatched( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasCtrsmBatched( handle, side, uplo, @@ -702,7 +702,7 @@ struct CUBlas> { const phi::dtype::complex *beta, phi::dtype::complex *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgemv( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasZgemv( handle, transa, m, @@ -724,7 +724,7 @@ struct CUBlas> { const int incX, phi::dtype::complex *Y, const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasZaxpy( handle, n, reinterpret_cast(alpha), @@ -754,7 +754,7 @@ struct CUBlas> { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasZgemmStridedBatched( handle, transa, transb, @@ -793,7 +793,7 @@ struct CUBlas> { const phi::dtype::complex *beta, phi::dtype::complex *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgemm( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasZgemm( handle, transa, transb, @@ -822,7 +822,7 @@ struct CUBlas> { int lda, phi::dtype::complex *B, int ldb) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZtrsm( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasZtrsm( handle, side, uplo, @@ -850,7 +850,7 @@ struct CUBlas> { phi::dtype::complex **B, int ldb, int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZtrsmBatched( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasZtrsmBatched( handle, side, uplo, @@ -899,25 +899,25 @@ struct CUBlas> { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); }); #else PADDLE_THROW(phi::errors::Unimplemented( @@ -1111,25 +1111,25 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - N, - CUDA_R_32F, - algo)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + N, + CUDA_R_32F, + algo)); }); #else // raise error @@ -1443,25 +1443,25 @@ inline void Blas::GEMM(bool transA, VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - ldc, - CUDA_R_32F, - algo)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + ldc, + CUDA_R_32F, + algo)); }); #else // raise error @@ -1615,29 +1615,29 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - a, - B, - fp, - ldb, - strideB, - A, - fp, - lda, - strideA, - b, - C, - fp, - ldc, - strideC, - batchCount, - compute_type, - algo)); + common::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + a, + B, + fp, + ldb, + strideB, + A, + fp, + lda, + strideA, + b, + C, + fp, + ldc, + strideC, + batchCount, + compute_type, + algo)); }); } else { #endif // CUDA_VERSION >= 9010 @@ -1707,29 +1707,29 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - strideB, - A, - CUDA_R_16BF, - lda, - strideA, - &h_beta, - C, - CUDA_R_16BF, - ldc, - strideC, - batchCount, - CUBLAS_COMPUTE_32F, - algo)); + common::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + strideB, + A, + CUDA_R_16BF, + lda, + strideA, + &h_beta, + C, + CUDA_R_16BF, + ldc, + strideC, + batchCount, + CUBLAS_COMPUTE_32F, + algo)); }); #else // raise error @@ -1950,26 +1950,26 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector C_ptr(C, C + batchCount); context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &f_alpha, - B_ptr.data().get(), - CUDA_R_16BF, - ldb, - A_ptr.data().get(), - CUDA_R_16BF, - lda, - &f_beta, - C_ptr.data().get(), - CUDA_R_16BF, - ldc, - batchCount, - CUDA_R_32F, - algo)); + common::dynload::cublasGemmBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &f_alpha, + B_ptr.data().get(), + CUDA_R_16BF, + ldb, + A_ptr.data().get(), + CUDA_R_16BF, + lda, + &f_beta, + C_ptr.data().get(), + CUDA_R_16BF, + ldc, + batchCount, + CUDA_R_32F, + algo)); }); #else // raise error diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h 
b/paddle/phi/kernels/funcs/blas/blas_impl.h index 11eff02fefe214..34f8c830892db5 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -113,27 +113,27 @@ template <> struct CBlas { template static void GEMM(ARGS... args) { - phi::dynload::cblas_sgemm(args...); + common::dynload::cblas_sgemm(args...); } template static float *GEMM_ALLOC(ARGS... args) { - return phi::dynload::cblas_sgemm_alloc(args...); + return common::dynload::cblas_sgemm_alloc(args...); } template static void GEMM_PACK(ARGS... args) { - phi::dynload::cblas_sgemm_pack(args...); + common::dynload::cblas_sgemm_pack(args...); } template static void GEMM_COMPUTE(ARGS... args) { - phi::dynload::cblas_sgemm_compute(args...); + common::dynload::cblas_sgemm_compute(args...); } template static void GEMM_FREE(ARGS... args) { - phi::dynload::cblas_sgemm_free(args...); + common::dynload::cblas_sgemm_free(args...); } #ifdef PADDLE_WITH_LIBXSMM @@ -145,93 +145,93 @@ struct CBlas { template static void AXPY(ARGS... args) { - phi::dynload::cblas_saxpy(args...); + common::dynload::cblas_saxpy(args...); } template static void VCOPY(ARGS... args) { - phi::dynload::cblas_scopy(args...); + common::dynload::cblas_scopy(args...); } template static void GEMV(ARGS... args) { - phi::dynload::cblas_sgemv(args...); + common::dynload::cblas_sgemv(args...); } template static float DOT(ARGS... args) { - return phi::dynload::cblas_sdot(args...); + return common::dynload::cblas_sdot(args...); } template static void SCAL(ARGS... args) { - phi::dynload::cblas_sscal(args...); + common::dynload::cblas_sscal(args...); } template static float ASUM(ARGS... args) { - return phi::dynload::cblas_sasum(args...); + return common::dynload::cblas_sasum(args...); } template static void GEMM_BATCH(ARGS... args) { - phi::dynload::cblas_sgemm_batch(args...); + common::dynload::cblas_sgemm_batch(args...); } template static void VADD(ARGS... args) { - phi::dynload::vsAdd(args...); + common::dynload::vsAdd(args...); } template static void VSUB(ARGS... args) { - phi::dynload::vsSub(args...); + common::dynload::vsSub(args...); } template static void VMUL(ARGS... args) { - phi::dynload::vsMul(args...); + common::dynload::vsMul(args...); } template static void VDIV(ARGS... args) { - phi::dynload::vsDiv(args...); + common::dynload::vsDiv(args...); } template static void VEXP(ARGS... args) { - phi::dynload::vsExp(args...); + common::dynload::vsExp(args...); } template static void VSQUARE(ARGS... args) { - phi::dynload::vsSqr(args...); + common::dynload::vsSqr(args...); } template static void VPOW(ARGS... args) { - phi::dynload::vsPowx(args...); + common::dynload::vsPowx(args...); } template static void VINV(ARGS... args) { - phi::dynload::vsInv(args...); + common::dynload::vsInv(args...); } template static void VMERF(ARGS... args) { - phi::dynload::vmsErf(args...); + common::dynload::vmsErf(args...); } #if !defined(_WIN32) template static void CSRMM(ARGS... args) { - phi::dynload::mkl_scsrmm(args...); + common::dynload::mkl_scsrmm(args...); } #endif template static void TRSM(ARGS... args) { - phi::dynload::cblas_strsm(args...); + common::dynload::cblas_strsm(args...); } }; @@ -239,27 +239,27 @@ template <> struct CBlas { template static void GEMM(ARGS... args) { - phi::dynload::cblas_dgemm(args...); + common::dynload::cblas_dgemm(args...); } template static double *GEMM_ALLOC(ARGS... 
args) { - return phi::dynload::cblas_dgemm_alloc(args...); + return common::dynload::cblas_dgemm_alloc(args...); } template static void GEMM_PACK(ARGS... args) { - phi::dynload::cblas_dgemm_pack(args...); + common::dynload::cblas_dgemm_pack(args...); } template static void GEMM_COMPUTE(ARGS... args) { - phi::dynload::cblas_dgemm_compute(args...); + common::dynload::cblas_dgemm_compute(args...); } template static void GEMM_FREE(ARGS... args) { - phi::dynload::cblas_dgemm_free(args...); + common::dynload::cblas_dgemm_free(args...); } #ifdef PADDLE_WITH_LIBXSMM @@ -271,93 +271,93 @@ struct CBlas { template static void AXPY(ARGS... args) { - phi::dynload::cblas_daxpy(args...); + common::dynload::cblas_daxpy(args...); } template static void VCOPY(ARGS... args) { - phi::dynload::cblas_dcopy(args...); + common::dynload::cblas_dcopy(args...); } template static void GEMV(ARGS... args) { - phi::dynload::cblas_dgemv(args...); + common::dynload::cblas_dgemv(args...); } template static double DOT(ARGS... args) { - return phi::dynload::cblas_ddot(args...); + return common::dynload::cblas_ddot(args...); } template static void SCAL(ARGS... args) { - phi::dynload::cblas_dscal(args...); + common::dynload::cblas_dscal(args...); } template static double ASUM(ARGS... args) { - return phi::dynload::cblas_dasum(args...); + return common::dynload::cblas_dasum(args...); } template static void GEMM_BATCH(ARGS... args) { - phi::dynload::cblas_dgemm_batch(args...); + common::dynload::cblas_dgemm_batch(args...); } template static void VADD(ARGS... args) { - phi::dynload::vdAdd(args...); + common::dynload::vdAdd(args...); } template static void VSUB(ARGS... args) { - phi::dynload::vdSub(args...); + common::dynload::vdSub(args...); } template static void VMUL(ARGS... args) { - phi::dynload::vdMul(args...); + common::dynload::vdMul(args...); } template static void VDIV(ARGS... args) { - phi::dynload::vdDiv(args...); + common::dynload::vdDiv(args...); } template static void VEXP(ARGS... args) { - phi::dynload::vdExp(args...); + common::dynload::vdExp(args...); } template static void VSQUARE(ARGS... args) { - phi::dynload::vdSqr(args...); + common::dynload::vdSqr(args...); } template static void VPOW(ARGS... args) { - phi::dynload::vdPowx(args...); + common::dynload::vdPowx(args...); } template static void VINV(ARGS... args) { - phi::dynload::vdInv(args...); + common::dynload::vdInv(args...); } template static void VMERF(ARGS... args) { - phi::dynload::vmdErf(args...); + common::dynload::vmdErf(args...); } #if !defined(_WIN32) template static void CSRMM(ARGS... args) { - phi::dynload::mkl_dcsrmm(args...); + common::dynload::mkl_dcsrmm(args...); } #endif template static void TRSM(ARGS... args) { - phi::dynload::cblas_dtrsm(args...); + common::dynload::cblas_dtrsm(args...); } }; @@ -370,12 +370,12 @@ struct CBlas> { const int incX, phi::dtype::complex *Y, const int incY) { - phi::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY); + common::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY); } template static void VCOPY(ARGS... args) { - phi::dynload::cblas_ccopy(args...); + common::dynload::cblas_ccopy(args...); } // the libmklml_intel.so paddle used has no vcAdd, vcSub, @@ -384,22 +384,22 @@ struct CBlas> { /* template static void VADD(ARGS... args) { - phi::dynload::vcAdd(args...); + common::dynload::vcAdd(args...); } template static void VSUB(ARGS... args) { - phi::dynload::vcSub(args...); + common::dynload::vcSub(args...); } template static void VMUL(ARGS... 
args) { - phi::dynload::vcMul(args...); + common::dynload::vcMul(args...); } template static void VDIV(ARGS... args) { - phi::dynload::vcDiv(args...); + common::dynload::vcDiv(args...); } */ @@ -458,7 +458,7 @@ struct CBlas> { const void *a_ = (const void *)(A); const void *x_ = (const void *)(X); void *y_ = static_cast(Y); - phi::dynload::cblas_cgemv( + common::dynload::cblas_cgemv( layout, trans, M, N, &alpha, a_, lda, x_, incx, &beta, y_, incy); } @@ -480,20 +480,20 @@ struct CBlas> { const void *a_ = (const void *)(A); const void *b_ = (const void *)(B); void *c_ = static_cast(C); - phi::dynload::cblas_cgemm(layout, - trans_a, - trans_b, - M, - N, - K, - &alpha, - a_, - lda, - b_, - ldb, - &beta, - c_, - ldc); + common::dynload::cblas_cgemm(layout, + trans_a, + trans_b, + M, + N, + K, + &alpha, + a_, + lda, + b_, + ldb, + &beta, + c_, + ldc); } static void TRSM(CBLAS_LAYOUT layout, @@ -510,7 +510,7 @@ struct CBlas> { int ldb) { const void *a_ = (const void *)(A); void *b_ = static_cast(B); - phi::dynload::cblas_ctrsm( + common::dynload::cblas_ctrsm( layout, side, uplo, trans_a, diag, M, N, &alpha, a_, lda, b_, ldb); } @@ -535,27 +535,27 @@ struct CBlas> { const void **B_void = (const void **)(&(*B)); void **C_void = reinterpret_cast(C); - phi::dynload::cblas_cgemm_batch(layout, - trans_a, - trans_b, - M, - N, - K, - alpha, - A_void, - lda, - B_void, - ldb, - beta, - C_void, - ldc, - group_count, - group_size); + common::dynload::cblas_cgemm_batch(layout, + trans_a, + trans_b, + M, + N, + K, + alpha, + A_void, + lda, + B_void, + ldb, + beta, + C_void, + ldc, + group_count, + group_size); } template static void GEMM_EX(ARGS... args) { - phi::dynload::cblas_cgemm_batch(args...); + common::dynload::cblas_cgemm_batch(args...); } }; @@ -568,12 +568,12 @@ struct CBlas> { const int incX, phi::dtype::complex *Y, const int incY) { - phi::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY); + common::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY); } template static void VCOPY(ARGS... args) { - phi::dynload::cblas_zcopy(args...); + common::dynload::cblas_zcopy(args...); } // the libmklml_intel.so paddle used has no vzAdd, vzSub, @@ -582,22 +582,22 @@ struct CBlas> { /* template static void VADD(ARGS... args) { - phi::dynload::vzAdd(args...); + common::dynload::vzAdd(args...); } template static void VSUB(ARGS... args) { - phi::dynload::vzSub(args...); + common::dynload::vzSub(args...); } template static void VMUL(ARGS... args) { - phi::dynload::vzMul(args...); + common::dynload::vzMul(args...); } template static void VDIV(ARGS... 
args) { - phi::dynload::vzDiv(args...); + common::dynload::vzDiv(args...); } */ @@ -656,7 +656,7 @@ struct CBlas> { const void *a_ = (const void *)(A); const void *x_ = (const void *)(X); void *y_ = static_cast(Y); - phi::dynload::cblas_zgemv( + common::dynload::cblas_zgemv( layout, trans, M, N, &alpha, a_, lda, x_, incx, &beta, y_, incy); } @@ -678,20 +678,20 @@ struct CBlas> { const void *a_ = (const void *)(A); const void *b_ = (const void *)(B); void *c_ = static_cast(C); - phi::dynload::cblas_zgemm(layout, - trans_a, - trans_b, - M, - N, - K, - &alpha, - a_, - lda, - b_, - ldb, - &beta, - c_, - ldc); + common::dynload::cblas_zgemm(layout, + trans_a, + trans_b, + M, + N, + K, + &alpha, + a_, + lda, + b_, + ldb, + &beta, + c_, + ldc); } static void TRSM(CBLAS_LAYOUT layout, @@ -708,7 +708,7 @@ struct CBlas> { int ldb) { const void *a_ = (const void *)(A); void *b_ = static_cast(B); - phi::dynload::cblas_ztrsm( + common::dynload::cblas_ztrsm( layout, side, uplo, trans_a, diag, M, N, &alpha, a_, lda, b_, ldb); } @@ -733,27 +733,27 @@ struct CBlas> { const void **B_void = (const void **)(&(*B)); void **C_void = reinterpret_cast(C); - phi::dynload::cblas_zgemm_batch(layout, - trans_a, - trans_b, - M, - N, - K, - alpha, - A_void, - lda, - B_void, - ldb, - beta, - C_void, - ldc, - group_count, - group_size); + common::dynload::cblas_zgemm_batch(layout, + trans_a, + trans_b, + M, + N, + K, + alpha, + A_void, + lda, + B_void, + ldb, + beta, + C_void, + ldc, + group_count, + group_size); } template static void GEMM_EX(ARGS... args) { - phi::dynload::cblas_zgemm_batch(args...); + common::dynload::cblas_zgemm_batch(args...); } }; diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index 224cf4d6cb4970..44890354f231c4 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -32,33 +32,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_sgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_sgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_saxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_saxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_sscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_sscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_scopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_scopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_sgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_sgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::rocblas_sgemm_strided_batched(args...)); + common::dynload::rocblas_sgemm_strided_batched(args...)); } // HIP not supportted, refer to the doc here: @@ -71,7 +71,7 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_strsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_strsm(args...)); } template @@ -103,33 +103,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... 
args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_dgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_dgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_daxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_daxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_dscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_dscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_dcopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_dcopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_dgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_dgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::rocblas_dgemm_strided_batched(args...)); + common::dynload::rocblas_dgemm_strided_batched(args...)); } template @@ -140,7 +140,7 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_dtrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_dtrsm(args...)); } template @@ -186,7 +186,7 @@ struct CUBlas { const float16 *beta, float16 *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_hgemm( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_hgemm( handle, transa, transb, @@ -221,7 +221,7 @@ struct CUBlas { int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_hgemm_strided_batched( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_hgemm_strided_batched( handle, transa, transb, @@ -265,30 +265,30 @@ struct CUBlas { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - C, - Ctype, - ldc, - computeType, - algo, - 0, - 0)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_gemm_ex(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + C, + Ctype, + ldc, + computeType, + algo, + 0, + 0)); }); } }; @@ -307,7 +307,7 @@ struct CUBlas> { const phi::dtype::complex *beta, phi::dtype::complex *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_cgemv( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_cgemv( handle, transa, m, @@ -329,7 +329,7 @@ struct CUBlas> { const int incX, phi::dtype::complex *Y, const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_caxpy( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_caxpy( handle, n, reinterpret_cast(alpha), @@ -357,7 +357,7 @@ struct CUBlas> { int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_cgemm_strided_batched( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_cgemm_strided_batched( handle, transa, transb, @@ -392,7 +392,7 @@ struct CUBlas> { const phi::dtype::complex *beta, phi::dtype::complex *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_cgemm( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_cgemm( handle, transa, transb, @@ -432,30 +432,30 @@ struct CUBlas> { 
rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - C, - Ctype, - ldc, - computeType, - algo, - 0, - 0)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_gemm_ex(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + C, + Ctype, + ldc, + computeType, + algo, + 0, + 0)); }); } }; @@ -474,7 +474,7 @@ struct CUBlas<phi::dtype::complex<double>> { const phi::dtype::complex<double> *beta, phi::dtype::complex<double> *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_zgemv( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_zgemv( handle, transa, m, @@ -496,7 +496,7 @@ struct CUBlas<phi::dtype::complex<double>> { const int incX, phi::dtype::complex<double> *Y, const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_zaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_zaxpy( handle, n, reinterpret_cast<const rocblas_double_complex *>(alpha), @@ -525,7 +525,7 @@ struct CUBlas<phi::dtype::complex<double>> { int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_zgemm_strided_batched( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_zgemm_strided_batched( handle, transa, transb, @@ -560,7 +560,7 @@ struct CUBlas<phi::dtype::complex<double>> { const phi::dtype::complex<double> *beta, phi::dtype::complex<double> *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_zgemm( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_zgemm( handle, transa, transb, @@ -600,30 +600,30 @@ struct CUBlas<phi::dtype::complex<double>> { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - C, - Ctype, - ldc, - computeType, - algo, - 0, - 0)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_gemm_ex(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + C, + Ctype, + ldc, + computeType, + algo, + 0, + 0)); }); } }; @@ -761,30 +761,30 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA, context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::rocblas_gemm_ex(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - rocblas_datatype_bf16_r, - ldb, - A, - rocblas_datatype_bf16_r, - lda, - &h_beta, - C, - rocblas_datatype_bf16_r, - N, - C, - rocblas_datatype_bf16_r, - N, - rocblas_datatype_f32_r, - algo, - 0, - 0)); + common::dynload::rocblas_gemm_ex(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + rocblas_datatype_bf16_r, + ldb, + A, + rocblas_datatype_bf16_r, + lda, + &h_beta, + C, + rocblas_datatype_bf16_r, + N, + C, + rocblas_datatype_bf16_r, + N, + rocblas_datatype_f32_r, + algo, + 0, + 0)); }); } @@ -1017,30 +1017,30 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA, context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::rocblas_gemm_ex(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - rocblas_datatype_bf16_r, - ldb, - A, - rocblas_datatype_bf16_r, - lda, - &h_beta, - C, - rocblas_datatype_bf16_r, - ldc, - C, - rocblas_datatype_bf16_r, - ldc, -
rocblas_datatype_f32_r, - algo, - 0, - 0)); + common::dynload::rocblas_gemm_ex(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + rocblas_datatype_bf16_r, + ldb, + A, + rocblas_datatype_bf16_r, + lda, + &h_beta, + C, + rocblas_datatype_bf16_r, + ldc, + C, + rocblas_datatype_bf16_r, + ldc, + rocblas_datatype_f32_r, + algo, + 0, + 0)); }); } @@ -1201,7 +1201,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, : rocblas_operation_transpose; const int64_t strideC = M * N; context_.CublasCall([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_hgemm_strided_batched( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_hgemm_strided_batched( handle, cuTransB, cuTransA, @@ -1254,24 +1254,24 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, const int64_t strideC = M * N; context_.CublasCall([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::rocblas_sgemm_strided_batched(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - strideB, - A, - lda, - strideA, - &beta, - C, - ldc, - strideC, - batchCount)); + common::dynload::rocblas_sgemm_strided_batched(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + strideB, + A, + lda, + strideA, + &beta, + C, + ldc, + strideC, + batchCount)); }); } @@ -1304,24 +1304,24 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, const int64_t strideC = M * N; context_.CublasCall([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::rocblas_dgemm_strided_batched(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - strideB, - A, - lda, - strideA, - &beta, - C, - ldc, - strideC, - batchCount)); + common::dynload::rocblas_dgemm_strided_batched(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + strideB, + A, + lda, + strideA, + &beta, + C, + ldc, + strideC, + batchCount)); }); } @@ -1355,36 +1355,36 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, rocblas_gemm_algo algo = rocblas_gemm_algo_standard; context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::rocblas_gemm_strided_batched_ex(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - rocblas_datatype_bf16_r, - ldb, - strideB, - A, - rocblas_datatype_bf16_r, - lda, - strideA, - &h_beta, - C, - rocblas_datatype_bf16_r, - ldc, - strideC, - C, - rocblas_datatype_bf16_r, - ldc, - strideC, - batchCount, - rocblas_datatype_f32_r, - algo, - 0, - 0)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::rocblas_gemm_strided_batched_ex( + handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + rocblas_datatype_bf16_r, + ldb, + strideB, + A, + rocblas_datatype_bf16_r, + lda, + strideA, + &h_beta, + C, + rocblas_datatype_bf16_r, + ldc, + strideC, + C, + rocblas_datatype_bf16_r, + ldc, + strideC, + batchCount, + rocblas_datatype_f32_r, + algo, + 0, + 0)); }); } diff --git a/paddle/phi/kernels/funcs/check_numerics_utils.h b/paddle/phi/kernels/funcs/check_numerics_utils.h index 473d7994058a8d..7e9514dab43be3 100644 --- a/paddle/phi/kernels/funcs/check_numerics_utils.h +++ b/paddle/phi/kernels/funcs/check_numerics_utils.h @@ -19,9 +19,9 @@ #endif #include +#include "paddle/common/enforce.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/enforce.h" #ifdef _WIN32 #include diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.h b/paddle/phi/kernels/funcs/concat_and_split_functor.h index 
9e3f663cb419c7..f2beeb96ff473e 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.h +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.h @@ -16,13 +16,13 @@ limitations under the License. */ #include +#include "paddle/common/data_type.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/concat_funcs.h b/paddle/phi/kernels/funcs/concat_funcs.h index db965c2ef9b654..c888a9d4e0d46a 100644 --- a/paddle/phi/kernels/funcs/concat_funcs.h +++ b/paddle/phi/kernels/funcs/concat_funcs.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/cpu_vec.h b/paddle/phi/kernels/funcs/cpu_vec.h index 6774cd391dd5d6..e6d08533e31c1f 100644 --- a/paddle/phi/kernels/funcs/cpu_vec.h +++ b/paddle/phi/kernels/funcs/cpu_vec.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include +#include "paddle/common/enforce.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#include "paddle/phi/core/enforce.h" #ifdef PADDLE_WITH_MKLML #include "paddle/phi/backends/dynload/mklml.h" @@ -60,23 +60,23 @@ inline void vec_exp(const int n, const float* x, float* y) { y[i] = std::exp(x[i]); } } else { - phi::dynload::vsExp(n, x, y); + common::dynload::vsExp(n, x, y); } } template <> inline void vec_exp(const int n, const double* x, double* y) { - phi::dynload::vdExp(n, x, y); + common::dynload::vdExp(n, x, y); } template <> inline void vec_scal(const int n, const float a, float* x) { - phi::dynload::cblas_sscal(n, a, x, 1); + common::dynload::cblas_sscal(n, a, x, 1); } template <> inline void vec_scal(const int n, const double a, double* x) { - phi::dynload::cblas_dscal(n, a, x, 1); + common::dynload::cblas_dscal(n, a, x, 1); } #endif diff --git a/paddle/phi/kernels/funcs/cross_entropy.cc b/paddle/phi/kernels/funcs/cross_entropy.cc index cf53e9ea65efcc..e1949cc26fb7b9 100644 --- a/paddle/phi/kernels/funcs/cross_entropy.cc +++ b/paddle/phi/kernels/funcs/cross_entropy.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/cross_entropy.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/cross_entropy.cu b/paddle/phi/kernels/funcs/cross_entropy.cu index 20a15f9e944fef..94a58912a1746d 100644 --- a/paddle/phi/kernels/funcs/cross_entropy.cu +++ b/paddle/phi/kernels/funcs/cross_entropy.cu @@ -14,11 +14,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/cross_entropy.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/cublaslt.h b/paddle/phi/kernels/funcs/cublaslt.h index 6278f159df075d..f0434def707bc6 100644 --- a/paddle/phi/kernels/funcs/cublaslt.h +++ b/paddle/phi/kernels/funcs/cublaslt.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/core/dense_tensor.h" -namespace dyl = phi::dynload; +namespace dyl = common::dynload; namespace phi { diff --git a/paddle/phi/kernels/funcs/cufft_util.h b/paddle/phi/kernels/funcs/cufft_util.h index 3a4a3ef5e59149..80b4f378d6f0da 100644 --- a/paddle/phi/kernels/funcs/cufft_util.h +++ b/paddle/phi/kernels/funcs/cufft_util.h @@ -15,9 +15,9 @@ #pragma once #include +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/dynload/cufft.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/fft.h" #include "paddle/phi/kernels/funcs/fft_key.h" @@ -29,7 +29,7 @@ namespace detail { class CuFFTHandle { public: CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cufftCreate(&handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cufftCreate(&handle_)); } CuFFTHandle(const CuFFTHandle& other) = delete; @@ -41,7 +41,7 @@ class CuFFTHandle { ::cufftHandle& get() { return handle_; } const ::cufftHandle& get() const { return handle_; } - ~CuFFTHandle() { phi::dynload::cufftDestroy(handle_); } + ~CuFFTHandle() { common::dynload::cufftDestroy(handle_); } private: ::cufftHandle handle_; @@ -108,23 +108,23 @@ class FFTConfig { // disable auto allocation of workspace to use allocator from the framework PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + common::dynload::cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cufftXtMakePlanMany(plan(), - signal_ndim, - signal_sizes.data(), - /* inembed */ nullptr, - /* base_istride */ 1L, - /* idist */ 1L, - itype, - /* onembed */ nullptr, - /* base_ostride */ 1L, - /* odist */ 1L, - otype, - batch_size, - &ws_size_, - exec_type)); + common::dynload::cufftXtMakePlanMany(plan(), + signal_ndim, + signal_sizes.data(), + /* inembed */ nullptr, + /* base_istride */ 1L, + /* idist */ 1L, + itype, + /* onembed */ nullptr, + /* base_ostride */ 1L, + /* odist */ 1L, + otype, + batch_size, + &ws_size_, + exec_type)); } FFTConfig(const FFTConfig& other) = delete; @@ -151,7 +151,7 @@ static void exec_plan(const FFTConfig& config, void* out_data, bool forward) { auto& plan = config.plan(); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cufftXtExec( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cufftXtExec( plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); } diff --git a/paddle/phi/kernels/funcs/cumprod.h b/paddle/phi/kernels/funcs/cumprod.h index 4eefd4559c33a2..fad43f4acef72a 100644 --- a/paddle/phi/kernels/funcs/cumprod.h +++ b/paddle/phi/kernels/funcs/cumprod.h @@ -13,7 +13,7 @@ // limitations under the License. 
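The CuFFTHandle class touched in the hunk above follows a standard RAII shape: the library handle is created in the constructor, destroyed in the destructor, and copying is deleted so the handle cannot be released twice. A minimal self-contained sketch of that shape follows; FakeHandle, fake_create, and fake_destroy are illustrative stand-ins, not real cuFFT/hipFFT API.

struct FakeHandle { int id; };
inline void fake_create(FakeHandle* h) { h->id = 1; }  // stand-in for cufftCreate
inline void fake_destroy(FakeHandle h) { (void)h; }    // stand-in for cufftDestroy

class RaiiFftHandle {
 public:
  RaiiFftHandle() { fake_create(&handle_); }       // acquire on construction
  RaiiFftHandle(const RaiiFftHandle&) = delete;    // non-copyable, as in CuFFTHandle
  RaiiFftHandle& operator=(const RaiiFftHandle&) = delete;
  ~RaiiFftHandle() { fake_destroy(handle_); }      // release on destruction
  FakeHandle& get() { return handle_; }

 private:
  FakeHandle handle_;
};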
#pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/detail/strided_memcpy.h b/paddle/phi/kernels/funcs/detail/strided_memcpy.h index 0cd07fdfd0e1ae..03e3bdde05ad09 100644 --- a/paddle/phi/kernels/funcs/detail/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/detail/strided_memcpy.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/device_context.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/kernels/funcs/dims_simplifier.h b/paddle/phi/kernels/funcs/dims_simplifier.h index 35621ed0abddb3..57056a19963c1e 100644 --- a/paddle/phi/kernels/funcs/dims_simplifier.h +++ b/paddle/phi/kernels/funcs/dims_simplifier.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "glog/logging.h" diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 5112bf3f35da49..80dfed1544340b 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -16,10 +16,10 @@ limitations under the License. */ #include "paddle/common/bfloat16.h" #include "paddle/common/complex.h" +#include "paddle/common/enforce.h" #include "paddle/common/float16.h" #include "paddle/common/hostdevice.h" #include "paddle/common/macros.h" -#include "paddle/phi/core/enforce.h" #if defined(__xpu__) #include diff --git a/paddle/phi/kernels/funcs/elementwise_utils.h b/paddle/phi/kernels/funcs/elementwise_utils.h index 3790044346dc42..5e3ae5a5d8ac7c 100644 --- a/paddle/phi/kernels/funcs/elementwise_utils.h +++ b/paddle/phi/kernels/funcs/elementwise_utils.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/fft.cc b/paddle/phi/kernels/funcs/fft.cc index 31ea37f5b5037e..a5bd2e1281ffce 100644 --- a/paddle/phi/kernels/funcs/fft.cc +++ b/paddle/phi/kernels/funcs/fft.cc @@ -113,18 +113,18 @@ void exec_fft(const phi::CPUContext& ctx, const FFTTransformType fft_type = GetFFTTransformType(x.dtype(), out->type()); if (fft_type == FFTTransformType::C2R && forward) { ConjKernel(ctx, collapsed_input, &collapsed_input); - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + MKL_DFTI_CHECK(common::dynload::DftiComputeBackward( desc.get(), collapsed_input.data(), collapsed_output.data())); } else if (fft_type == FFTTransformType::R2C && !forward) { - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + MKL_DFTI_CHECK(common::dynload::DftiComputeForward( desc.get(), collapsed_input.data(), collapsed_output.data())); ConjKernel(ctx, collapsed_output, &collapsed_output); } else { if (forward) { - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + MKL_DFTI_CHECK(common::dynload::DftiComputeForward( desc.get(), collapsed_input.data(), collapsed_output.data())); } else { - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + MKL_DFTI_CHECK(common::dynload::DftiComputeBackward( desc.get(), collapsed_input.data(), collapsed_output.data())); } } diff --git a/paddle/phi/kernels/funcs/fft.cu b/paddle/phi/kernels/funcs/fft.cu index e13a79b335ac0e..2010f7bdab0130 100644 --- a/paddle/phi/kernels/funcs/fft.cu +++ b/paddle/phi/kernels/funcs/fft.cu @@ -17,8 +17,8 @@ #include "paddle/phi/kernels/funcs/fft.h" #include "paddle/phi/kernels/funcs/fft_cache.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/assign_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -93,7 +93,7 @@ bool has_large_prime_factor(int64_t n) { inline bool use_cache(const int64_t* signal_size) { bool using_cache = true; int cufft_version; - phi::dynload::cufftGetVersion(&cufft_version); + common::dynload::cufftGetVersion(&cufft_version); if (10300 <= cufft_version && cufft_version <= 10400) { using_cache = std::none_of( signal_size + 1, signal_size + kMaxDataNdim, [](int64_t dim_size) { @@ -190,14 +190,14 @@ void exec_fft(const phi::GPUContext& ctx, // prepare cufft for execution #if defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cufftSetStream(config->plan(), ctx.stream())); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cufftSetWorkArea(config->plan(), workspace_tensor.data())); + common::dynload::cufftSetStream(config->plan(), ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cufftSetWorkArea( + config->plan(), workspace_tensor.data())); #elif defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::hipfftSetStream(config->plan(), ctx.stream())); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::hipfftSetWorkArea(config->plan(), workspace_tensor.data())); + common::dynload::hipfftSetStream(config->plan(), ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::hipfftSetWorkArea( + config->plan(), workspace_tensor.data())); #endif // execution of fft plan diff --git a/paddle/phi/kernels/funcs/fft.h b/paddle/phi/kernels/funcs/fft.h index 3f9e1191ebb3e6..86aa6e1f8cb729 100644 --- a/paddle/phi/kernels/funcs/fft.h +++ b/paddle/phi/kernels/funcs/fft.h @@ -15,9 +15,9 @@ #pragma once 
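Every hunk in this patch applies the same mechanical rewrite: phi::dynload::X becomes common::dynload::X with the argument list untouched. A hypothetical compatibility shim, not part of this diff, sketches how the old spelling could be kept compiling during such a migration via a namespace alias; dummy_exp is a made-up stand-in for a dynloaded symbol such as vsExp.

#include <cmath>

namespace common {
namespace dynload {
// Made-up stand-in for a dynamically loaded vendor routine.
inline void dummy_exp(int n, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] = std::exp(x[i]);
}
}  // namespace dynload
}  // namespace common

namespace phi {
// Alias: the legacy phi::dynload spelling forwards to common::dynload.
namespace dynload = ::common::dynload;
}  // namespace phi

int main() {
  float x = 0.0f, y = -1.0f;
  phi::dynload::dummy_exp(1, &x, &y);  // resolves to common::dynload::dummy_exp
  return (y == 1.0f) ? 0 : 1;          // exp(0) == 1 exactly
}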
#include +#include "paddle/common/data_type.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/fft_key.h b/paddle/phi/kernels/funcs/fft_key.h index 5893cfc6ba019f..2531ff25310fa9 100644 --- a/paddle/phi/kernels/funcs/fft_key.h +++ b/paddle/phi/kernels/funcs/fft_key.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/phi/core/utils/data_type.h" +#include "paddle/common/data_type.h" #include "paddle/phi/kernels/funcs/fft.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h index df55625eada6d1..0944980da7218f 100644 --- a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h +++ b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h @@ -27,6 +27,7 @@ limitations under the License. */ #if CUDA_VERSION >= 11060 #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/common/float16.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/dynload/cublasLt.h" @@ -34,7 +35,6 @@ limitations under the License. */ #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/scope_guard.h" #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/utils/flags.h" @@ -90,9 +90,9 @@ class GemmEpilogueAlgoCache { cublasLtMatmulPreference_t preference; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatmulPreferenceCreate(&preference)); + common::dynload::cublasLtMatmulPreferenceCreate(&preference)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatmulPreferenceSetAttribute( + common::dynload::cublasLtMatmulPreferenceSetAttribute( preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, @@ -101,17 +101,17 @@ class GemmEpilogueAlgoCache { int returned_results = 0; std::vector heuristic_results( requested_algo_count_); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle, - op_desc, - a_desc, - b_desc, - c_desc, - c_desc, - preference, - requested_algo_count_, - heuristic_results.data(), - &returned_results)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulAlgoGetHeuristic( + lt_handle, + op_desc, + a_desc, + b_desc, + c_desc, + c_desc, + preference, + requested_algo_count_, + heuristic_results.data(), + &returned_results)); PADDLE_ENFORCE_GT( returned_results, @@ -119,7 +119,7 @@ class GemmEpilogueAlgoCache { phi::errors::Unavailable("No GEMM epilogue algorithm support!")); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatmulPreferenceDestroy(preference)); + common::dynload::cublasLtMatmulPreferenceDestroy(preference)); int best_algo_idx = -1; float best_algo_time = 0; @@ -127,23 +127,23 @@ class GemmEpilogueAlgoCache { // Run 100 times for warmup int warmup_algo_idx = 0; for (int t = 0; t < 100; t++) { - cublasStatus_t status = - phi::dynload::cublasLtMatmul(lt_handle, - op_desc, - alpha, - a, - a_desc, - b, - b_desc, - beta, - c, - c_desc, - c, - c_desc, - &heuristic_results[warmup_algo_idx].algo, - workspace, - workspace_size, - stream); + cublasStatus_t status = common::dynload::cublasLtMatmul( + lt_handle, + op_desc, + alpha, + a, + a_desc, + b, + b_desc, + beta, + c, + c_desc, + c, + c_desc, + &heuristic_results[warmup_algo_idx].algo, + workspace, + workspace_size, + stream); if (status != 
CUBLAS_STATUS_SUCCESS) { t = -1; warmup_algo_idx += 1; @@ -165,22 +165,22 @@ class GemmEpilogueAlgoCache { PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream)); cublasStatus_t status = - phi::dynload::cublasLtMatmul(lt_handle, - op_desc, - alpha, - a, - a_desc, - b, - b_desc, - beta, - c, - c_desc, - c, - c_desc, - &heuristic_results[algo_idx].algo, - workspace, - workspace_size, - stream); + common::dynload::cublasLtMatmul(lt_handle, + op_desc, + alpha, + a, + a_desc, + b, + b_desc, + beta, + c, + c_desc, + c, + c_desc, + &heuristic_results[algo_idx].algo, + workspace, + workspace_size, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(stop_event)); @@ -236,7 +236,7 @@ class GemmEpilogueAlgoCache { int trans_a, trans_b; uint32_t epilogue; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescGetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescGetAttribute( desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_a, @@ -244,7 +244,7 @@ class GemmEpilogueAlgoCache { &size_to_write)); HashValue_(seed, hash_fn, static_cast(trans_a)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescGetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescGetAttribute( desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_b, @@ -252,7 +252,7 @@ class GemmEpilogueAlgoCache { &size_to_write)); HashValue_(seed, hash_fn, static_cast(trans_b)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescGetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescGetAttribute( desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, @@ -270,40 +270,54 @@ class GemmEpilogueAlgoCache { uint64_t row, col; int64_t ld, batch_offset; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutGetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_TYPE, - &dtype, - sizeof(dtype), - &size_to_write)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatrixLayoutGetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &dtype, + sizeof(dtype), + &size_to_write)); HashValue_(seed, hash_fn, static_cast(dtype)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutGetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batch, - sizeof(batch), - &size_to_write)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatrixLayoutGetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, + &batch, + sizeof(batch), + &size_to_write)); HashValue_(seed, hash_fn, static_cast(batch)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_ROWS, &row, sizeof(row), &size_to_write)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatrixLayoutGetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_ROWS, + &row, + sizeof(row), + &size_to_write)); HashValue_(seed, hash_fn, static_cast(row)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_COLS, &col, sizeof(col), &size_to_write)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatrixLayoutGetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_COLS, + &col, + sizeof(col), + &size_to_write)); HashValue_(seed, hash_fn, static_cast(col)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_LD, &ld, sizeof(ld), &size_to_write)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatrixLayoutGetAttribute( + desc, CUBLASLT_MATRIX_LAYOUT_LD, &ld, sizeof(ld), 
&size_to_write)); HashValue_(seed, hash_fn, static_cast(ld)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutGetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - &batch_offset, - sizeof(batch_offset), - &size_to_write)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatrixLayoutGetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &batch_offset, + sizeof(batch_offset), + &size_to_write)); HashValue_(seed, hash_fn, static_cast(batch_offset)); } @@ -365,24 +379,24 @@ void ComputeFusedGemmEpilogueForward(const phi::GPUContext& dev_ctx, } cublasLtMatmulDesc_t operation_desc = NULL; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescCreate( &operation_desc, compute_type, scale_type)); cublasOperation_t transx = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t transy = trans_y ? CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transx, sizeof(transx))); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transy, sizeof(transy))); cublasLtEpilogue_t epiloque_func = GetEpilogueType(activation, enable_auxiliary); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func, sizeof(epiloque_func))); const T* bias_data = bias->data(); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_data, @@ -404,13 +418,13 @@ void ComputeFusedGemmEpilogueForward(const phi::GPUContext& dev_ctx, void* aux_data = reserve_space->data(); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &aux_data, sizeof(aux_data))); int64_t aux_ld = N; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &aux_ld, @@ -419,21 +433,21 @@ void ComputeFusedGemmEpilogueForward(const phi::GPUContext& dev_ctx, cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL; if (trans_x) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatrixLayoutCreate(&x_desc, mat_type, M, K, M)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, M, K, M)); } else { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatrixLayoutCreate(&x_desc, mat_type, K, M, K)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); } if (trans_y) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatrixLayoutCreate(&y_desc, mat_type, K, N, K)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, K, N, K)); } else { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatrixLayoutCreate(&y_desc, mat_type, N, K, N)); + 
PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); } - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatrixLayoutCreate(&out_desc, mat_type, N, M, N)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( + &out_desc, mat_type, N, M, N)); cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); // NOTE(zengjinle): I do not know whether the 4MB workspace size is @@ -464,29 +478,31 @@ void ComputeFusedGemmEpilogueForward(const phi::GPUContext& dev_ctx, stream, workspace->ptr(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmul(lt_handle, - operation_desc, - &alpha, - y_data, - y_desc, - x_data, - x_desc, - &beta, - out_data, - out_desc, - out_data, - out_desc, - algo, - workspace->ptr(), - workspace_size, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmul(lt_handle, + operation_desc, + &alpha, + y_data, + y_desc, + x_data, + x_desc, + &beta, + out_data, + out_desc, + out_data, + out_desc, + algo, + workspace->ptr(), + workspace_size, + stream)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatmulDescDestroy(operation_desc)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutDestroy(y_desc)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutDestroy(x_desc)); + common::dynload::cublasLtMatmulDescDestroy(operation_desc)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatrixLayoutDestroy(out_desc)); + common::dynload::cublasLtMatrixLayoutDestroy(y_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatrixLayoutDestroy(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatrixLayoutDestroy(out_desc)); } struct BwdFusedEpilogueSetter { @@ -671,18 +687,18 @@ void ComputeFusedGemmEpilogueBackwardImplDev( for (auto desc : descs) { if (desc) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatrixLayoutDestroy(desc)); + common::dynload::cublasLtMatrixLayoutDestroy(desc)); } } if (dx_operation_desc) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatmulDescDestroy(dx_operation_desc)); + common::dynload::cublasLtMatmulDescDestroy(dx_operation_desc)); } if (dy_operation_desc) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasLtMatmulDescDestroy(dy_operation_desc)); + common::dynload::cublasLtMatmulDescDestroy(dy_operation_desc)); } }); @@ -700,16 +716,16 @@ void ComputeFusedGemmEpilogueBackwardImplDev( if (TransX) { dx_dout_desc = &dout_trans_desc; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( dx_dout_desc, mat_type, z_row, z_col, z_row)); } else { dx_dout_desc = &dout_desc; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( dx_dout_desc, mat_type, z_col, z_row, z_col)); } dx_y_desc = &y_trans_desc; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( dx_y_desc, mat_type, y_col, y_row, y_col)); auto& a_desc = kXGradAIsDZ ? 
(*dx_dout_desc) : (*dx_y_desc); @@ -717,21 +733,21 @@ void ComputeFusedGemmEpilogueBackwardImplDev( auto a_trans = BoolToCuBlasEnum(Trait::kXGradATrans); auto b_trans = BoolToCuBlasEnum(Trait::kXGradBTrans); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( &dx_desc, phi::backends::gpu::ToCudaDataType(), x_col, x_row, x_col)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescCreate( &dx_operation_desc, compute_type, scale_type)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &a_trans, sizeof(a_trans))); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &b_trans, @@ -739,7 +755,7 @@ void ComputeFusedGemmEpilogueBackwardImplDev( cublasLtEpilogue_t epiloque_func_for_dx = GetEpilogueGradType(activation_grad); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func_for_dx, @@ -747,17 +763,19 @@ void ComputeFusedGemmEpilogueBackwardImplDev( if (activation_grad != "none") { auto* aux_data = reserve_space->data(); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( - dx_operation_desc, - CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, - &aux_data, - sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, + sizeof(aux_data))); int64_t aux_ld = TransX ? 
M : K; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( - dx_operation_desc, - CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, - &aux_ld, - sizeof(aux_ld))); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, + &aux_ld, + sizeof(aux_ld))); } auto dx_workspace = memory_utils::Alloc( @@ -786,22 +804,23 @@ void ComputeFusedGemmEpilogueBackwardImplDev( dx_workspace->ptr(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmul(lt_handle, - dx_operation_desc, - &alpha, - b_data, - b_desc, - a_data, - a_desc, - &beta_dx, - dx_data, - dx_desc, - dx_data, - dx_desc, - algo, - dx_workspace->ptr(), - workspace_size, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatmul(lt_handle, + dx_operation_desc, + &alpha, + b_data, + b_desc, + a_data, + a_desc, + &beta_dx, + dx_data, + dx_desc, + dx_data, + dx_desc, + algo, + dx_workspace->ptr(), + workspace_size, + stream)); } // dy = func(dout, x) @@ -812,19 +831,19 @@ void ComputeFusedGemmEpilogueBackwardImplDev( if (TransX) { dy_dout_desc = &dout_trans_desc; if (dout_trans_desc == nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( dy_dout_desc, mat_type, z_row, z_col, z_row)); } } else { dy_dout_desc = &dout_desc; if (dout_desc == nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( dy_dout_desc, mat_type, z_col, z_row, z_col)); } } dy_x_desc = &x_trans_desc; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( dy_x_desc, mat_type, x_col, x_row, x_col)); auto& a_desc = kYGradAIsDZ ? 
(*dy_dout_desc) : (*dy_x_desc); @@ -832,22 +851,22 @@ void ComputeFusedGemmEpilogueBackwardImplDev( auto a_trans = BoolToCuBlasEnum(Trait::kYGradATrans); auto b_trans = BoolToCuBlasEnum(Trait::kYGradBTrans); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatrixLayoutCreate( &dy_desc, phi::backends::gpu::ToCudaDataType(), y_col, y_row, y_col)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescCreate( &dy_operation_desc, compute_type, scale_type)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &a_trans, sizeof(a_trans))); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &b_trans, @@ -864,7 +883,7 @@ void ComputeFusedGemmEpilogueBackwardImplDev( } } - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cublasLtMatmulDescSetAttribute( dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func_for_dy, @@ -873,11 +892,12 @@ void ComputeFusedGemmEpilogueBackwardImplDev( if (dbias) { auto* dbias_data = dev_ctx.Alloc(dbias, dbias->numel() * sizeof(DYT)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmulDescSetAttribute( - dy_operation_desc, - CUBLASLT_MATMUL_DESC_BIAS_POINTER, - &dbias_data, - sizeof(dbias_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, + CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &dbias_data, + sizeof(dbias_data))); } auto dy_workspace = memory_utils::Alloc( @@ -905,22 +925,23 @@ void ComputeFusedGemmEpilogueBackwardImplDev( dy_workspace->ptr(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasLtMatmul(lt_handle, - dy_operation_desc, - &alpha, - b_data, - b_desc, - a_data, - a_desc, - &beta_dy, - dy_data, - dy_desc, - dy_data, - dy_desc, - algo, - dy_workspace->ptr(), - workspace_size, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cublasLtMatmul(lt_handle, + dy_operation_desc, + &alpha, + b_data, + b_desc, + a_data, + a_desc, + &beta_dy, + dy_data, + dy_desc, + dy_data, + dy_desc, + algo, + dy_workspace->ptr(), + workspace_size, + stream)); } } diff --git a/paddle/phi/kernels/funcs/gather.h b/paddle/phi/kernels/funcs/gather.h index 7afc6280d374d9..87f85efc956436 100644 --- a/paddle/phi/kernels/funcs/gather.h +++ b/paddle/phi/kernels/funcs/gather.h @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/ddim.h" #include "paddle/common/macros.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/gpc.cc b/paddle/phi/kernels/funcs/gpc.cc index cd02f276392086..038ffde44605f6 100644 --- a/paddle/phi/kernels/funcs/gpc.cc +++ b/paddle/phi/kernels/funcs/gpc.cc @@ -26,7 +26,7 @@ #include "paddle/phi/kernels/funcs/gpc.h" #include -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/gru_compute.h b/paddle/phi/kernels/funcs/gru_compute.h index 7e53c88b7394a0..a0529d7f75858c 100644 --- a/paddle/phi/kernels/funcs/gru_compute.h +++ b/paddle/phi/kernels/funcs/gru_compute.h @@ -11,8 +11,8 @@ limitations under the License. */ #pragma once +#include "paddle/common/enforce.h" #include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/hipfft_util.h b/paddle/phi/kernels/funcs/hipfft_util.h index 74ca06fcf17f04..a3baf5cb3f26d6 100644 --- a/paddle/phi/kernels/funcs/hipfft_util.h +++ b/paddle/phi/kernels/funcs/hipfft_util.h @@ -15,8 +15,8 @@ #pragma once #include +#include "paddle/common/enforce.h" #include "paddle/phi/backends/dynload/hipfft.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/fft.h" #include "paddle/phi/kernels/funcs/fft_key.h" @@ -28,7 +28,7 @@ namespace detail { class HIPFFTHandle { public: HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftCreate(&handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::hipfftCreate(&handle_)); } HIPFFTHandle(const HIPFFTHandle& other) = delete; @@ -40,7 +40,7 @@ class HIPFFTHandle { ::hipfftHandle& get() { return handle_; } const ::hipfftHandle& get() const { return handle_; } - ~HIPFFTHandle() { phi::dynload::hipfftDestroy(handle_); } + ~HIPFFTHandle() { common::dynload::hipfftDestroy(handle_); } private: ::hipfftHandle handle_; @@ -88,20 +88,20 @@ class FFTConfig { // disable auto allocation of workspace to use allocator from the framework PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::hipfftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + common::dynload::hipfftSetAutoAllocation(plan(), /* autoAllocate */ 0)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::hipfftMakePlanMany(plan(), - signal_ndim, - signal_sizes.data(), - /* inembed */ nullptr, - /* base_istride */ 1, - /* idist */ 1, - /* onembed */ nullptr, - /* base_ostride */ 1, - /* odist */ 1, - exec_type, - batch_size, - &ws_size_)); + common::dynload::hipfftMakePlanMany(plan(), + signal_ndim, + signal_sizes.data(), + /* inembed */ nullptr, + /* base_istride */ 1, + /* idist */ 1, + /* onembed */ nullptr, + /* base_ostride */ 1, + /* odist */ 1, + exec_type, + batch_size, + &ws_size_)); } const hipfftHandle& plan() const { return plan_.get(); } @@ -127,7 +127,7 @@ static void exec_plan(const FFTConfig& config, if (value_type == DataType::FLOAT32) { switch (config.transform_type()) { case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecC2C( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::hipfftExecC2C( plan, static_cast(in_data), static_cast(out_data), @@ -135,24 +135,24 @@ static void exec_plan(const FFTConfig& config, return; } case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS( - 
phi::dynload::hipfftExecR2C(plan, - static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::hipfftExecR2C( + plan, + static_cast(in_data), + static_cast(out_data))); return; } case FFTTransformType::C2R: { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::hipfftExecC2R(plan, - static_cast(in_data), - static_cast(out_data))); + common::dynload::hipfftExecC2R(plan, + static_cast(in_data), + static_cast(out_data))); return; } } } else if (value_type == DataType::FLOAT64) { switch (config.transform_type()) { case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecZ2Z( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::hipfftExecZ2Z( plan, static_cast(in_data), static_cast(out_data), @@ -160,14 +160,14 @@ static void exec_plan(const FFTConfig& config, return; } case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecD2Z( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::hipfftExecD2Z( plan, static_cast(in_data), static_cast(out_data))); return; } case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecZ2D( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::hipfftExecZ2D( plan, static_cast(in_data), static_cast(out_data))); diff --git a/paddle/phi/kernels/funcs/im2col.h b/paddle/phi/kernels/funcs/im2col.h index 73b2866924d1e9..683a1615ee3486 100644 --- a/paddle/phi/kernels/funcs/im2col.h +++ b/paddle/phi/kernels/funcs/im2col.h @@ -16,10 +16,10 @@ limitations under the License. */ #include +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/inclusive_scan.h b/paddle/phi/kernels/funcs/inclusive_scan.h index 265febd306f334..413f9d5eae67cb 100644 --- a/paddle/phi/kernels/funcs/inclusive_scan.h +++ b/paddle/phi/kernels/funcs/inclusive_scan.h @@ -25,9 +25,9 @@ namespace cub = hipcub; #include #include +#include "paddle/common/enforce.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/type_traits.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/for_range.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/interpolate_function.h b/paddle/phi/kernels/funcs/interpolate_function.h index 23731285926da4..91e797cda90920 100644 --- a/paddle/phi/kernels/funcs/interpolate_function.h +++ b/paddle/phi/kernels/funcs/interpolate_function.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #if defined(__NVCC__) || defined(__HIPCC__) diff --git a/paddle/phi/kernels/funcs/jit/benchmark.cc b/paddle/phi/kernels/funcs/jit/benchmark.cc index 894a711ddec6d7..545cbc079c46ae 100644 --- a/paddle/phi/kernels/funcs/jit/benchmark.cc +++ b/paddle/phi/kernels/funcs/jit/benchmark.cc @@ -16,10 +16,10 @@ #include #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/phi/api/profiler/device_tracer.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/jit/kernels.h" #include "paddle/utils/flags.h" diff --git a/paddle/phi/kernels/funcs/jit/gen/act.h b/paddle/phi/kernels/funcs/jit/gen/act.h index 098bfe19728be3..9019af68176cf1 100644 --- 
a/paddle/phi/kernels/funcs/jit/gen/act.h +++ b/paddle/phi/kernels/funcs/jit/gen/act.h @@ -17,7 +17,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/gen/jitcode.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/gen/adam.h b/paddle/phi/kernels/funcs/jit/gen/adam.h index 5c432e03ec9214..be56661ff55338 100644 --- a/paddle/phi/kernels/funcs/jit/gen/adam.h +++ b/paddle/phi/kernels/funcs/jit/gen/adam.h @@ -17,7 +17,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/gen/jitcode.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/gen/adamw.h b/paddle/phi/kernels/funcs/jit/gen/adamw.h index dab90e0e0f69e1..9e0caa549e6a67 100644 --- a/paddle/phi/kernels/funcs/jit/gen/adamw.h +++ b/paddle/phi/kernels/funcs/jit/gen/adamw.h @@ -17,7 +17,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/gen/jitcode.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/gen/blas.h b/paddle/phi/kernels/funcs/jit/gen/blas.h index a046634440ea81..5c889e246077a3 100644 --- a/paddle/phi/kernels/funcs/jit/gen/blas.h +++ b/paddle/phi/kernels/funcs/jit/gen/blas.h @@ -17,7 +17,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/gen/jitcode.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/gen/embseqpool.h b/paddle/phi/kernels/funcs/jit/gen/embseqpool.h index 8e201b7538ebe1..86464b74b632e1 100644 --- a/paddle/phi/kernels/funcs/jit/gen/embseqpool.h +++ b/paddle/phi/kernels/funcs/jit/gen/embseqpool.h @@ -17,7 +17,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/gen/jitcode.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/gen/matmul.h b/paddle/phi/kernels/funcs/jit/gen/matmul.h index dcbec14250d86a..62eda7742eae16 100644 --- a/paddle/phi/kernels/funcs/jit/gen/matmul.h +++ b/paddle/phi/kernels/funcs/jit/gen/matmul.h @@ -20,7 +20,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/gen/jitcode.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/gen/seqpool.h b/paddle/phi/kernels/funcs/jit/gen/seqpool.h index 260fd6cde88ffa..83a75cb8f81f13 100644 --- a/paddle/phi/kernels/funcs/jit/gen/seqpool.h +++ b/paddle/phi/kernels/funcs/jit/gen/seqpool.h @@ -17,7 +17,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/gen/jitcode.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/gen/sgd.h b/paddle/phi/kernels/funcs/jit/gen/sgd.h index 4f9617ccdafb2e..b58b21e15cce7f 100644 --- a/paddle/phi/kernels/funcs/jit/gen/sgd.h +++ b/paddle/phi/kernels/funcs/jit/gen/sgd.h @@ -17,7 +17,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/gen/jitcode.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/gen/vbroadcast.h b/paddle/phi/kernels/funcs/jit/gen/vbroadcast.h index b1cf8521dd76c4..9aa7b8456eccae 100644 --- a/paddle/phi/kernels/funcs/jit/gen/vbroadcast.h +++ 
b/paddle/phi/kernels/funcs/jit/gen/vbroadcast.h @@ -17,7 +17,7 @@ #include #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/gen/jitcode.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/gen_base.cc b/paddle/phi/kernels/funcs/jit/gen_base.cc index 3758aaf4cace8d..bacfcd59877329 100644 --- a/paddle/phi/kernels/funcs/jit/gen_base.cc +++ b/paddle/phi/kernels/funcs/jit/gen_base.cc @@ -16,8 +16,8 @@ #include +#include "paddle/common/enforce.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#include "paddle/phi/core/enforce.h" #ifdef _WIN32 #define posix_memalign_free _aligned_free diff --git a/paddle/phi/kernels/funcs/jit/helper.cc b/paddle/phi/kernels/funcs/jit/helper.cc index c135d6ee3177dd..8f5f64b48ef69f 100644 --- a/paddle/phi/kernels/funcs/jit/helper.cc +++ b/paddle/phi/kernels/funcs/jit/helper.cc @@ -16,7 +16,7 @@ #include -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { namespace jit { diff --git a/paddle/phi/kernels/funcs/jit/helper.h b/paddle/phi/kernels/funcs/jit/helper.h index 7e3394dffd4a2a..69e647494d45ce 100644 --- a/paddle/phi/kernels/funcs/jit/helper.h +++ b/paddle/phi/kernels/funcs/jit/helper.h @@ -22,8 +22,8 @@ #include // for std::move #include +#include "paddle/common/enforce.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/jit/gen_base.h" #include "paddle/phi/kernels/funcs/jit/kernel_base.h" #include "paddle/phi/kernels/funcs/jit/kernel_key.h" diff --git a/paddle/phi/kernels/funcs/jit/more/mkl/mkl.cc b/paddle/phi/kernels/funcs/jit/more/mkl/mkl.cc index deaeba0224fb98..2523315fa29713 100644 --- a/paddle/phi/kernels/funcs/jit/more/mkl/mkl.cc +++ b/paddle/phi/kernels/funcs/jit/more/mkl/mkl.cc @@ -29,20 +29,20 @@ void MatMul(const float* a, const float* b, float* c, const matmul_attr_t* attr) { - phi::dynload::cblas_sgemm(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - attr->m, - attr->n, - attr->k, - 1.f, - a, - attr->k, - b, - attr->n, - 0.f, - c, - attr->n); + common::dynload::cblas_sgemm(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + attr->m, + attr->n, + attr->k, + 1.f, + a, + attr->k, + b, + attr->n, + 0.f, + c, + attr->n); } template <> @@ -50,46 +50,46 @@ void MatMul(const double* a, const double* b, double* c, const matmul_attr_t* attr) { - phi::dynload::cblas_dgemm(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - attr->m, - attr->n, - attr->k, - 1.0, - a, - attr->k, - b, - attr->n, - 0.0, - c, - attr->n); + common::dynload::cblas_dgemm(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + attr->m, + attr->n, + attr->k, + 1.0, + a, + attr->k, + b, + attr->n, + 0.0, + c, + attr->n); } template <> void VMul(const float* x, const float* y, float* z, int n) { - phi::dynload::vsMul(n, x, y, z); + common::dynload::vsMul(n, x, y, z); } template <> void VMul(const double* x, const double* y, double* z, int n) { - phi::dynload::vdMul(n, x, y, z); + common::dynload::vdMul(n, x, y, z); } template <> void VAdd(const float* x, const float* y, float* z, int n) { - phi::dynload::vsAdd(n, x, y, z); + common::dynload::vsAdd(n, x, y, z); } template <> void VAdd(const double* x, const double* y, double* z, int n) { - phi::dynload::vdAdd(n, x, y, z); + common::dynload::vdAdd(n, x, y, z); } template <> void VScal(const float* a, const float* x, float* y, int n) { if (x == y) { - phi::dynload::cblas_sscal(n, *a, y, 1); + common::dynload::cblas_sscal(n, *a, y, 1); } else { refer::VScal(a, x, 
y, n); } @@ -98,7 +98,7 @@ void VScal(const float* a, const float* x, float* y, int n) { template <> void VScal(const double* a, const double* x, double* y, int n) { if (x == y) { - phi::dynload::cblas_dscal(n, *a, y, 1); + common::dynload::cblas_dscal(n, *a, y, 1); } else { refer::VScal(a, x, y, n); } @@ -106,52 +106,52 @@ void VScal(const double* a, const double* x, double* y, int n) { template <> void VExp(const float* x, float* y, int n) { - phi::dynload::vsExp(n, x, y); + common::dynload::vsExp(n, x, y); } template <> void VExp(const double* x, double* y, int n) { - phi::dynload::vdExp(n, x, y); + common::dynload::vdExp(n, x, y); } template <> void VSquare(const float* x, float* y, int n) { - phi::dynload::vsSqr(n, x, y); + common::dynload::vsSqr(n, x, y); } template <> void VSquare(const double* x, double* y, int n) { - phi::dynload::vdSqr(n, x, y); + common::dynload::vdSqr(n, x, y); } template <> void VCopy(const float* x, float* y, int n) { - phi::dynload::cblas_scopy(n, x, 1, y, 1); + common::dynload::cblas_scopy(n, x, 1, y, 1); } template <> void VCopy(const double* x, double* y, int n) { - phi::dynload::cblas_dcopy(n, x, 1, y, 1); + common::dynload::cblas_dcopy(n, x, 1, y, 1); } template <> void VAXPY(float a, const float* x, float* y, int n) { - phi::dynload::cblas_saxpy(n, a, x, 1, y, 1); + common::dynload::cblas_saxpy(n, a, x, 1, y, 1); } template <> void VAXPY(double a, const double* x, double* y, int n) { - phi::dynload::cblas_daxpy(n, a, x, 1, y, 1); + common::dynload::cblas_daxpy(n, a, x, 1, y, 1); } template <> void ASum(const float* x, float* res, int n) { - res[0] = phi::dynload::cblas_sasum(n, x, 1); + res[0] = common::dynload::cblas_sasum(n, x, 1); } template <> void ASum(const double* x, double* res, int n) { - res[0] = phi::dynload::cblas_dasum(n, x, 1); + res[0] = common::dynload::cblas_dasum(n, x, 1); } // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 diff --git a/paddle/phi/kernels/funcs/jit/more/mkl/mkl.h b/paddle/phi/kernels/funcs/jit/more/mkl/mkl.h index 017fd7980039dc..069f62fe22f5eb 100644 --- a/paddle/phi/kernels/funcs/jit/more/mkl/mkl.h +++ b/paddle/phi/kernels/funcs/jit/more/mkl/mkl.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/kernel_base.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/jit/refer/refer.h b/paddle/phi/kernels/funcs/jit/refer/refer.h index c7c3835f890682..781b417064c0fc 100644 --- a/paddle/phi/kernels/funcs/jit/refer/refer.h +++ b/paddle/phi/kernels/funcs/jit/refer/refer.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/kernels/funcs/jit/helper.h" #include "paddle/phi/kernels/funcs/jit/kernel_base.h" diff --git a/paddle/phi/kernels/funcs/jit/test.cc b/paddle/phi/kernels/funcs/jit/test.cc index d388d95975cff9..f80a5b49cba47b 100644 --- a/paddle/phi/kernels/funcs/jit/test.cc +++ b/paddle/phi/kernels/funcs/jit/test.cc @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/jit/kernels.h" #include "paddle/utils/flags.h" diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 1a52e57e45f236..6a82875819161b 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -26,11 +26,11 @@ namespace cub = hipcub; #include "glog/logging.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/lstm_compute.h b/paddle/phi/kernels/funcs/lstm_compute.h index 56cd975d848caa..275f20872a9cc1 100644 --- a/paddle/phi/kernels/funcs/lstm_compute.h +++ b/paddle/phi/kernels/funcs/lstm_compute.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once +#include "paddle/common/enforce.h" #include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h index 5390d77c876f5f..9cd0db530dc4f0 100644 --- a/paddle/phi/kernels/funcs/math_function.h +++ b/paddle/phi/kernels/funcs/math_function.h @@ -17,10 +17,10 @@ limitations under the License. */ #include #include +#include "paddle/common/data_type.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/utils/data_type.h" #ifdef PADDLE_WITH_XPU #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" diff --git a/paddle/phi/kernels/funcs/matrix_inverse.h b/paddle/phi/kernels/funcs/matrix_inverse.h index f0cd265a546481..0b42785e5f6681 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.h +++ b/paddle/phi/kernels/funcs/matrix_inverse.h @@ -18,9 +18,9 @@ limitations under the License. */ #include "Eigen/Core" #include "Eigen/LU" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/matrix_solve.h b/paddle/phi/kernels/funcs/matrix_solve.h index 3856c06c1b25fc..a21a5eb4ec8da4 100644 --- a/paddle/phi/kernels/funcs/matrix_solve.h +++ b/paddle/phi/kernels/funcs/matrix_solve.h @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include "Eigen/Core" #include "Eigen/LU" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/mkl_fft_utils.h b/paddle/phi/kernels/funcs/mkl_fft_utils.h index dbc0678ab7ae5a..ba85f6f9107aa2 100644 --- a/paddle/phi/kernels/funcs/mkl_fft_utils.h +++ b/paddle/phi/kernels/funcs/mkl_fft_utils.h @@ -23,18 +23,18 @@ namespace phi { namespace funcs { namespace detail { -#define MKL_DFTI_CHECK(expr) \ - do { \ - MKL_LONG status = (expr); \ - if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ - PADDLE_THROW( \ - phi::errors::External(phi::dynload::DftiErrorMessage(status))); \ +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!common::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW( \ + phi::errors::External(common::dynload::DftiErrorMessage(status))); \ } while (0); struct DftiDescriptorDeleter { void operator()(DFTI_DESCRIPTOR_HANDLE handle) { if (handle != nullptr) { - MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); + MKL_DFTI_CHECK(common::dynload::DftiFreeDescriptor(&handle)); } } }; @@ -52,7 +52,7 @@ class DftiDescriptor { "DftiDescriptor has already been initialized.")); DFTI_DESCRIPTOR* raw_desc; - MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( + MKL_DFTI_CHECK(common::dynload::DftiCreateDescriptorX( &raw_desc, precision, signal_type, signal_ndim, sizes)); desc_.reset(raw_desc); } @@ -105,20 +105,20 @@ static DftiDescriptor plan_mkl_fft(const DataType in_dtype, descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); // placement inplace or not inplace - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + MKL_DFTI_CHECK(common::dynload::DftiSetValue( descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); // number of transformations const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + MKL_DFTI_CHECK(common::dynload::DftiSetValue( descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); // input & output distance const MKL_LONG idist = in_strides[0]; const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + MKL_DFTI_CHECK(common::dynload::DftiSetValue( + descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(common::dynload::DftiSetValue( descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); // input & output stride @@ -128,14 +128,14 @@ static DftiDescriptor plan_mkl_fft(const DataType in_dtype, mkl_in_stride[i] = in_strides[i]; mkl_out_stride[i] = out_strides[i]; } - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + MKL_DFTI_CHECK(common::dynload::DftiSetValue( descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + MKL_DFTI_CHECK(common::dynload::DftiSetValue( descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); // conjugate even storage if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + MKL_DFTI_CHECK(common::dynload::DftiSetValue( descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); } @@ -158,12 +158,12 @@ static DftiDescriptor plan_mkl_fft(const DataType in_dtype, return DFTI_BACKWARD_SCALE; } }(); - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); + MKL_DFTI_CHECK(common::dynload::DftiSetValue( + descriptor.get(), 
scale_direction, scale)); } // commit the descriptor - MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); + MKL_DFTI_CHECK(common::dynload::DftiCommitDescriptor(descriptor.get())); return descriptor; } diff --git a/paddle/phi/kernels/funcs/multinomial_functor.h b/paddle/phi/kernels/funcs/multinomial_functor.h index 05a5a0faf67746..40428820a88998 100644 --- a/paddle/phi/kernels/funcs/multinomial_functor.h +++ b/paddle/phi/kernels/funcs/multinomial_functor.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once +#include "paddle/common/enforce.h" #include "paddle/phi/core/device_context.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/norm_utils.h b/paddle/phi/kernels/funcs/norm_utils.h index 5c898549b353ea..7df4acc980ab41 100644 --- a/paddle/phi/kernels/funcs/norm_utils.h +++ b/paddle/phi/kernels/funcs/norm_utils.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/padding.h b/paddle/phi/kernels/funcs/padding.h index d6faa5f824c0d5..fc83dfe53cda60 100644 --- a/paddle/phi/kernels/funcs/padding.h +++ b/paddle/phi/kernels/funcs/padding.h @@ -16,8 +16,8 @@ limitations under the License. */ #include #include +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/paddle/phi/kernels/funcs/range_function.h b/paddle/phi/kernels/funcs/range_function.h index 5ace32f46ace17..e8d805833440ae 100644 --- a/paddle/phi/kernels/funcs/range_function.h +++ b/paddle/phi/kernels/funcs/range_function.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 93d4433ef6877b..75fe2f585a9f50 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -58,8 +58,8 @@ using dim3 = phi::kps::dim3; #endif #include "paddle/common/array.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_utils.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/paddle/phi/kernels/funcs/scatter.h b/paddle/phi/kernels/funcs/scatter.h index 5934f57b47ddec..6a30012d73e7cf 100644 --- a/paddle/phi/kernels/funcs/scatter.h +++ b/paddle/phi/kernels/funcs/scatter.h @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/common/ddim.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index 96b7942cf27094..a756997dbfa53c 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -25,9 +25,9 @@ namespace cub = hipcub; #endif #include +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cc b/paddle/phi/kernels/funcs/selected_rows_functor.cc index 5696f44f68fd7e..b1a7f672c82c64 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cc +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/mixed_vector.h" #ifdef PADDLE_WITH_XPU diff --git a/paddle/phi/kernels/funcs/slice.h b/paddle/phi/kernels/funcs/slice.h index 38b127541650be..bbe95e5e5fe7d4 100644 --- a/paddle/phi/kernels/funcs/slice.h +++ b/paddle/phi/kernels/funcs/slice.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/paddle/phi/kernels/funcs/softmax.cu b/paddle/phi/kernels/funcs/softmax.cu index 2ca97cd4ac2055..7874b54f0cb551 100644 --- a/paddle/phi/kernels/funcs/softmax.cu +++ b/paddle/phi/kernels/funcs/softmax.cu @@ -51,30 +51,30 @@ void SoftmaxCUDNNFunctor::operator()( miopenTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSoftmaxForward_V2(context.cudnn_handle(), - CudnnDataType::kOne(), - cudnn_x_desc, - X->data(), - CudnnDataType::kZero(), - cudnn_y_desc, - context.template Alloc(Y), - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_INSTANCE)); + common::dynload::miopenSoftmaxForward_V2(context.cudnn_handle(), + CudnnDataType::kOne(), + cudnn_x_desc, + X->data(), + CudnnDataType::kZero(), + cudnn_y_desc, + context.template Alloc(Y), + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_INSTANCE)); #else cudnnTensorDescriptor_t cudnn_x_desc = xDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSoftmaxForward(context.cudnn_handle(), - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_INSTANCE, - CudnnDataType::kOne(), - cudnn_x_desc, - X->data(), - CudnnDataType::kZero(), - cudnn_y_desc, - context.template Alloc(Y))); + common::dynload::cudnnSoftmaxForward(context.cudnn_handle(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + CudnnDataType::kOne(), + cudnn_x_desc, + X->data(), + CudnnDataType::kZero(), + cudnn_y_desc, + context.template Alloc(Y))); #endif } @@ -105,18 +105,18 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); 
miopenTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenSoftmaxBackward_V2(context.cudnn_handle(), - CudnnDataType::kOne(), - cudnn_y_desc, - Y->data(), - cudnn_ygrad_desc, - YGrad->data(), - CudnnDataType::kZero(), - cudnn_xgrad_desc, - context.template Alloc(XGrad), - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_INSTANCE)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSoftmaxBackward_V2( + context.cudnn_handle(), + CudnnDataType::kOne(), + cudnn_y_desc, + Y->data(), + cudnn_ygrad_desc, + YGrad->data(), + CudnnDataType::kZero(), + cudnn_xgrad_desc, + context.template Alloc(XGrad), + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_INSTANCE)); #else cudnnTensorDescriptor_t cudnn_y_desc = yDesc.descriptor(layout, cudnn_tensor_dims); @@ -125,17 +125,17 @@ void SoftmaxGradCUDNNFunctor::operator()( cudnnTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSoftmaxBackward(context.cudnn_handle(), - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_INSTANCE, - CudnnDataType::kOne(), - cudnn_y_desc, - Y->data(), - cudnn_ygrad_desc, - YGrad->data(), - CudnnDataType::kZero(), - cudnn_xgrad_desc, - context.template Alloc(XGrad))); + common::dynload::cudnnSoftmaxBackward(context.cudnn_handle(), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + CudnnDataType::kOne(), + cudnn_y_desc, + Y->data(), + cudnn_ygrad_desc, + YGrad->data(), + CudnnDataType::kZero(), + cudnn_xgrad_desc, + context.template Alloc(XGrad))); #endif } diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h index e4c836d1162523..451fbabeee528e 100644 --- a/paddle/phi/kernels/funcs/sparse/common_shape.h +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index e6f3a573088b28..7048ca1a127f5c 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/funcs/sparse/flatten_indices.h b/paddle/phi/kernels/funcs/sparse/flatten_indices.h index 9a031b8cc12ca4..4edcd839572dbb 100644 --- a/paddle/phi/kernels/funcs/sparse/flatten_indices.h +++ b/paddle/phi/kernels/funcs/sparse/flatten_indices.h @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/sparse/softmax.h b/paddle/phi/kernels/funcs/sparse/softmax.h index fcb45def6c1fae..2a820461c4181b 100644 --- a/paddle/phi/kernels/funcs/sparse/softmax.h +++ b/paddle/phi/kernels/funcs/sparse/softmax.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/tensor_utils.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 63449ecbda7a22..c7bc3e2f4bb806 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -16,13 +16,13 @@ #include "glog/logging.h" +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/common/float16.h" #include "paddle/phi/backends/dynload/cusparse.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/visit_type.h" @@ -90,22 +90,22 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, int64_t batch_nnz = x.nnz() / batch_size; cudaDataType_t gpu_type = GetGpuDataType(); dev_ctx.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCreateCsr(descriptor, - M, - N, - batch_nnz, - const_cast(crows_data), - const_cast(cols_data), - const_cast(values_data), - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_BASE_ZERO, - gpu_type); + common::dynload::cusparseCreateCsr(descriptor, + M, + N, + batch_nnz, + const_cast(crows_data), + const_cast(cols_data), + const_cast(values_data), + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, + gpu_type); }); if (batch_size > 1) { #if CUDA_VERSION >= 11080 dev_ctx.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCsrSetStridedBatch( + common::dynload::cusparseCsrSetStridedBatch( *descriptor, batch_size, M + 1, batch_nnz); }); #else @@ -144,22 +144,22 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, int64_t batch_nnz = nnz / batch_size; cudaDataType_t gpu_type = GetGpuDataType(); dev_ctx.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCreateCoo(descriptor, - M, - N, - batch_nnz, - const_cast(rows_data), - const_cast(cols_data), - const_cast(values_data), - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_BASE_ZERO, - gpu_type); + common::dynload::cusparseCreateCoo(descriptor, + M, + N, + batch_nnz, + const_cast(rows_data), + const_cast(cols_data), + const_cast(values_data), + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, + gpu_type); }); if (batch_size > 1) { #if CUDA_VERSION >= 11080 dev_ctx.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCooSetStridedBatch( + common::dynload::cusparseCooSetStridedBatch( *descriptor, batch_size, batch_nnz); }); #else @@ -195,7 +195,7 @@ class CuSparseSpMatDescriptor { ~CuSparseSpMatDescriptor() { dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseDestroySpMat(descriptor_); + common::dynload::cusparseDestroySpMat(descriptor_); }); VLOG(6) << "Destroy cusparseSpMatDescr_t " << &descriptor_; } @@ -232,20 +232,20 @@ class CuSparseDnMatDescriptor { const T* x_data = x.data(); cudaDataType_t gpu_type = GetGpuDataType(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCreateDnMat(&descriptor_, - M, - N, - N, - const_cast(x_data), - gpu_type, - CUSPARSE_ORDER_ROW); + common::dynload::cusparseCreateDnMat(&descriptor_, + M, + N, + N, + const_cast(x_data), + gpu_type, + CUSPARSE_ORDER_ROW); }); PADDLE_ENFORCE_EQ(x.numel(), 
batch_size * M * N); if (batch_size > 1) { #if CUDA_VERSION >= 11080 dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseDnMatSetStridedBatch( + common::dynload::cusparseDnMatSetStridedBatch( descriptor_, batch_size, M * N); }); #else @@ -259,7 +259,7 @@ class CuSparseDnMatDescriptor { ~CuSparseDnMatDescriptor() { dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseDestroyDnMat(descriptor_); + common::dynload::cusparseDestroyDnMat(descriptor_); }); VLOG(6) << "Destroy cusparseDnMatDescr_t " << &descriptor_; } @@ -288,7 +288,7 @@ class CuSparseDnVecDescriptor { const T* x_data = x.data(); cudaDataType_t gpu_type = GetGpuDataType(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseCreateDnVec( + common::dynload::cusparseCreateDnVec( &descriptor_, x.numel(), const_cast(x_data), gpu_type); }); @@ -297,7 +297,7 @@ class CuSparseDnVecDescriptor { ~CuSparseDnVecDescriptor() { dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseDestroyDnVec(descriptor_); + common::dynload::cusparseDestroyDnVec(descriptor_); }); VLOG(6) << "Destroy cusparseDnVecDescr_t " << &descriptor_; } @@ -326,17 +326,17 @@ void SparseBlas::SPMM(bool transa, cudaDataType_t gpu_type = GetGpuDataType(); size_t buffer_size = 0; dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpMM_bufferSize(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - GetSpMMAlgorithm(mat_a), - &buffer_size); + common::dynload::cusparseSpMM_bufferSize(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + GetSpMMAlgorithm(mat_a), + &buffer_size); }); phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc( @@ -345,17 +345,17 @@ void SparseBlas::SPMM(bool transa, phi::Stream(reinterpret_cast(dev_ctx_.stream()))); void* tmp_buffer_ptr = tmp_buffer->ptr(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpMM(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - GetSpMMAlgorithm(mat_a), - tmp_buffer_ptr); + common::dynload::cusparseSpMM(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + GetSpMMAlgorithm(mat_a), + tmp_buffer_ptr); }); } @@ -375,20 +375,20 @@ void SparseBlas::SPMV(bool transa, cudaDataType_t gpu_type = GetGpuDataType(); size_t buffer_size = 0; dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpMV_bufferSize(handle, - GetTransposeOperation(transa), - &alpha, - a_descriptor.descriptor(), - x_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, + common::dynload::cusparseSpMV_bufferSize(handle, + GetTransposeOperation(transa), + &alpha, + a_descriptor.descriptor(), + x_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, #if CUDA_VERSION >= 11040 - CUSPARSE_SPMV_ALG_DEFAULT, + CUSPARSE_SPMV_ALG_DEFAULT, #else - CUSPARSE_MV_ALG_DEFAULT, + CUSPARSE_MV_ALG_DEFAULT, #endif - &buffer_size); + &buffer_size); }); phi::Allocator::AllocationPtr 
tmp_buffer = phi::memory_utils::Alloc( @@ -397,20 +397,20 @@ void SparseBlas::SPMV(bool transa, phi::Stream(reinterpret_cast(dev_ctx_.stream()))); void* tmp_buffer_ptr = tmp_buffer->ptr(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSpMV(handle, - GetTransposeOperation(transa), - &alpha, - a_descriptor.descriptor(), - x_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, + common::dynload::cusparseSpMV(handle, + GetTransposeOperation(transa), + &alpha, + a_descriptor.descriptor(), + x_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, #if CUDA_VERSION >= 11040 - CUSPARSE_SPMV_ALG_DEFAULT, + CUSPARSE_SPMV_ALG_DEFAULT, #else - CUSPARSE_MV_ALG_DEFAULT, + CUSPARSE_MV_ALG_DEFAULT, #endif - tmp_buffer_ptr); + tmp_buffer_ptr); }); } @@ -432,17 +432,17 @@ void SparseBlas::SDDMM(bool transa, cudaDataType_t gpu_type = GetGpuDataType(); size_t buffer_size = 0; dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSDDMM_bufferSize(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - CUSPARSE_SDDMM_ALG_DEFAULT, - &buffer_size); + common::dynload::cusparseSDDMM_bufferSize(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + CUSPARSE_SDDMM_ALG_DEFAULT, + &buffer_size); }); phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc( @@ -452,31 +452,31 @@ void SparseBlas::SDDMM(bool transa, void* tmp_buffer_ptr = tmp_buffer->ptr(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSDDMM_preprocess(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - CUSPARSE_SDDMM_ALG_DEFAULT, - tmp_buffer_ptr); + common::dynload::cusparseSDDMM_preprocess(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + CUSPARSE_SDDMM_ALG_DEFAULT, + tmp_buffer_ptr); }); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { - phi::dynload::cusparseSDDMM(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - CUSPARSE_SDDMM_ALG_DEFAULT, - tmp_buffer_ptr); + common::dynload::cusparseSDDMM(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + CUSPARSE_SDDMM_ALG_DEFAULT, + tmp_buffer_ptr); }); } #endif diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h index cbd42be3cb6d49..62175c284b25ab 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h @@ -14,13 +14,13 @@ #pragma once +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/dynload/rocsparse.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/data_type.h" #include 
"paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/visit_type.h" @@ -91,17 +91,17 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, rocsparse_indextype jtype = GetGpuIndexType(); rocsparse_datatype ttype = GetGpuDataType(); dev_ctx.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_create_csr_descr(descriptor, - M, - N, - batch_nnz, - const_cast(crows_data), - const_cast(cols_data), - const_cast(values_data), - itype, - jtype, - rocsparse_index_base_zero, - ttype); + common::dynload::rocsparse_create_csr_descr(descriptor, + M, + N, + batch_nnz, + const_cast(crows_data), + const_cast(cols_data), + const_cast(values_data), + itype, + jtype, + rocsparse_index_base_zero, + ttype); }); if (batch_size > 1) { // TODO(umiswing): Add batch sparse matmul support for ROCM after 5.2.0 @@ -140,16 +140,16 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, rocsparse_indextype itype = GetGpuIndexType(); rocsparse_datatype ttype = GetGpuDataType(); dev_ctx.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_create_coo_descr(descriptor, - M, - N, - batch_nnz, - const_cast(rows_data), - const_cast(cols_data), - const_cast(values_data), - itype, - rocsparse_index_base_zero, - ttype); + common::dynload::rocsparse_create_coo_descr(descriptor, + M, + N, + batch_nnz, + const_cast(rows_data), + const_cast(cols_data), + const_cast(values_data), + itype, + rocsparse_index_base_zero, + ttype); }); if (batch_size > 1) { @@ -184,7 +184,7 @@ class RocSparseSpMatDescriptor { ~RocSparseSpMatDescriptor() { dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_destroy_spmat_descr(descriptor_); + common::dynload::rocsparse_destroy_spmat_descr(descriptor_); }); VLOG(6) << "Destroy roscparse_spmat_descr " << &descriptor_; } @@ -221,13 +221,13 @@ class RocSparseDnMatDescriptor { const T* x_data = x.data(); rocsparse_datatype ttype = GetGpuDataType(); dev_ctx.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_create_dnmat_descr(&descriptor_, - M, - N, - N, - const_cast(x_data), - ttype, - rocsparse_order_row); + common::dynload::rocsparse_create_dnmat_descr(&descriptor_, + M, + N, + N, + const_cast(x_data), + ttype, + rocsparse_order_row); }); PADDLE_ENFORCE_EQ( @@ -246,7 +246,7 @@ class RocSparseDnMatDescriptor { ~RocSparseDnMatDescriptor() { dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_destroy_dnmat_descr(descriptor_); + common::dynload::rocsparse_destroy_dnmat_descr(descriptor_); }); VLOG(6) << "Destroy rocsparse_dnmat_descr " << &descriptor_; } @@ -277,19 +277,19 @@ void SparseBlas::SPMM(bool transa, // Query SpMM buffer dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_spmm(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - ttype, - GetSpMMAlgorithm(mat_a), - rocsparse_spmm_stage_buffer_size, - &buffer_size, - nullptr); + common::dynload::rocsparse_spmm(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + ttype, + GetSpMMAlgorithm(mat_a), + rocsparse_spmm_stage_buffer_size, + 
&buffer_size, + nullptr); }); // Allocate buffer @@ -301,36 +301,36 @@ void SparseBlas::SPMM(bool transa, // Preprocess data dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_spmm(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - ttype, - GetSpMMAlgorithm(mat_a), - rocsparse_spmm_stage_preprocess, - &buffer_size, - tmp_buffer_ptr); + common::dynload::rocsparse_spmm(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + ttype, + GetSpMMAlgorithm(mat_a), + rocsparse_spmm_stage_preprocess, + &buffer_size, + tmp_buffer_ptr); }); // Performs the actual SpMM computation dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_spmm(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - ttype, - GetSpMMAlgorithm(mat_a), - rocsparse_spmm_stage_compute, - &buffer_size, - tmp_buffer_ptr); + common::dynload::rocsparse_spmm(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + ttype, + GetSpMMAlgorithm(mat_a), + rocsparse_spmm_stage_compute, + &buffer_size, + tmp_buffer_ptr); }); } @@ -352,17 +352,17 @@ void SparseBlas::SDDMM(bool transa, rocsparse_datatype gpu_type = GetGpuDataType(); size_t buffer_size = 0; dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_sddmm_buffer_size(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - rocsparse_sddmm_alg_default, - &buffer_size); + common::dynload::rocsparse_sddmm_buffer_size(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + rocsparse_sddmm_alg_default, + &buffer_size); }); phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc( @@ -372,31 +372,31 @@ void SparseBlas::SDDMM(bool transa, void* tmp_buffer_ptr = tmp_buffer->ptr(); dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_sddmm_preprocess(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - rocsparse_sddmm_alg_default, - tmp_buffer_ptr); + common::dynload::rocsparse_sddmm_preprocess(handle, + GetTransposeOperation(transa), + GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + rocsparse_sddmm_alg_default, + tmp_buffer_ptr); }); dev_ctx_.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_sddmm(handle, - GetTransposeOperation(transa), - GetTransposeOperation(transb), - &alpha, - a_descriptor.descriptor(), - b_descriptor.descriptor(), - &beta, - out_descriptor.descriptor(), - gpu_type, - rocsparse_sddmm_alg_default, - tmp_buffer_ptr); + common::dynload::rocsparse_sddmm(handle, + GetTransposeOperation(transa), + 
GetTransposeOperation(transb), + &alpha, + a_descriptor.descriptor(), + b_descriptor.descriptor(), + &beta, + out_descriptor.descriptor(), + gpu_type, + rocsparse_sddmm_alg_default, + tmp_buffer_ptr); }); } #endif diff --git a/paddle/phi/kernels/funcs/strided_slice.h b/paddle/phi/kernels/funcs/strided_slice.h index 4a88c1e0660b79..b71fbeab519f63 100644 --- a/paddle/phi/kernels/funcs/strided_slice.h +++ b/paddle/phi/kernels/funcs/strided_slice.h @@ -17,9 +17,9 @@ #include #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_array.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h index 806d7cca84851d..ce50227ea921a5 100644 --- a/paddle/phi/kernels/funcs/unique_functor.h +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -15,8 +15,8 @@ #pragma once #include +#include "paddle/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/funcs/unsqueeze.h b/paddle/phi/kernels/funcs/unsqueeze.h index b15e781b25117b..9abb989df23ae7 100644 --- a/paddle/phi/kernels/funcs/unsqueeze.h +++ b/paddle/phi/kernels/funcs/unsqueeze.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/dense_tensor.h" // TODO(paddle-dev): Remove this file when we can call related Kernel directly diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index e7dae6b82711b9..66287b69ba12e2 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -15,7 +15,7 @@ #pragma once #ifdef PADDLE_WITH_CUDA #include "paddle/common/backends/dynload/cusolver.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #endif // PADDLE_WITH_CUDA #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/phi/kernels/funcs/vol2col.h b/paddle/phi/kernels/funcs/vol2col.h index 283ab3ea065635..bd909927952d04 100644 --- a/paddle/phi/kernels/funcs/vol2col.h +++ b/paddle/phi/kernels/funcs/vol2col.h @@ -16,8 +16,8 @@ limitations under the License. */ #include +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc b/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc index 3cb37ccf2ed89d..bbcb61bd454765 100644 --- a/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/distributed_fused_lamb_init_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_upper_triangle_kernel.cc b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_upper_triangle_kernel.cc index b9ded16d1b0958..6257e9c451aaa7 100644 --- a/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_upper_triangle_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_upper_triangle_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/fused_softmax_mask_upper_triangle_kernel.h" -#include "paddle/phi/core/errors.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc index b88a93b419beb7..5683c7a09c82ed 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_gru_kernel.cc @@ -16,9 +16,9 @@ #include #include +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/common/float16.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc index e16c081a37ea08..c5ee7983c493e0 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_seqconv_eltadd_relu_kernel.cc @@ -15,9 +15,9 @@ #include // for min, max #include +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/common/float16.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/fusion/cpu/fusion_seqexpand_concat_fc_kernel.cc b/paddle/phi/kernels/fusion/cpu/fusion_seqexpand_concat_fc_kernel.cc index 09026060965aaa..1b93ab7a139609 100644 --- a/paddle/phi/kernels/fusion/cpu/fusion_seqexpand_concat_fc_kernel.cc +++ b/paddle/phi/kernels/fusion/cpu/fusion_seqexpand_concat_fc_kernel.cc @@ -14,10 +14,10 @@ #include +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/common/float16.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h index eaceb46d69d741..0d7c29457be3c9 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h @@ -24,9 +24,9 @@ #include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/enforce.h" namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu 
b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu index cc4fd467dfc20b..f9c3cb0e7c7610 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu @@ -14,8 +14,8 @@ #include "glog/logging.h" +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/autogen/memory_efficient_attention.h" #include "paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/gemm_kernel_utils.h" diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/gemm_kernel_utils.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/gemm_kernel_utils.h index 3442818c817172..5e043341d2b757 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/gemm_kernel_utils.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/gemm_kernel_utils.h @@ -20,8 +20,8 @@ #pragma once #include "cutlass/arch/mma.h" +#include "paddle/common/enforce.h" #include "paddle/fluid/platform/errors.h" -#include "paddle/phi/core/enforce.h" //////////////////////////////////////////////////////////////////////////////// // Some helper functions diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h index 56ed034ff5ad5a..15fe7a2ea573fb 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h @@ -66,8 +66,8 @@ #include "iterators/epilogue_predicated_tile_iterator.h" #include "transform/tile_smem_loader.h" +#include "paddle/common/enforce.h" #include "paddle/fluid/platform/errors.h" -#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_forward.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_forward.h index 232ded25a7390e..f992955ecfdc89 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_forward.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_forward.h @@ -57,8 +57,8 @@ #include "gemm/mma_from_smem.h" #include "transform/tile_smem_loader.h" +#include "paddle/common/enforce.h" #include "paddle/fluid/platform/errors.h" -#include "paddle/phi/core/enforce.h" // namespace phi { using namespace gemm_kernel_utils; // NOLINT diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h index 65dfb1bc8eced4..43afbdb55707dd 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_utils.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h b/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h index 5ae8aed256ccdd..0b569dec8d588e 100644 --- a/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h +++ b/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h @@ -14,10 +14,10 @@ #pragma once +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" 
-#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" namespace phi { diff --git a/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu b/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu index 1a23b39b3a2ff9..614c934338a503 100644 --- a/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu @@ -25,11 +25,11 @@ #include "glog/logging.h" #include "paddle/common/backends/dynload/cudnn.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/impl/conv_cudnn_impl.h" #include "paddle/utils/optional.h" @@ -63,14 +63,14 @@ class CudnnConvDescManager { bool is_sys_pad; // TODO(wilber): The destruction of cudnn descriptor depends on the - // phi::dynload::cudnn singleton, but when the process exits, the singleton - // destruction order cannot be determined. - // After testing, it is found that the phi::dynload::cudnn related singleton - // on Windows is destructed first, causing the descriptor to be destructed - // and failed, while the descriptor on Linux is destructed first, and the - // phi::dynload::cudnn singleton is destructed later, so that it is correct. - // To circumvent this problem, we rely entirely on freeing resources when - // the process exits. + // common::dynload::cudnn singleton, but when the process exits, the + // singleton destruction order cannot be determined. After testing, it is + // found that the common::dynload::cudnn related singleton on Windows is + // destructed first, causing the descriptor to be destructed and failed, + // while the descriptor on Linux is destructed first, and the + // common::dynload::cudnn singleton is destructed later, so that it is + // correct. To circumvent this problem, we rely entirely on freeing + // resources when the process exits. 
// ~CudnnCacheInfo() { // if (x_desc) delete x_desc; @@ -462,7 +462,7 @@ void ConvFusionKernel(const Context& ctx, std::unique_ptr perf_results( new cudnnConvolutionFwdAlgoPerf_t[phi::kNUM_CUDNN_FWD_ALGS]); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionForwardAlgorithm_v7( + common::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, x_desc, w_desc, @@ -474,7 +474,7 @@ void ConvFusionKernel(const Context& ctx, *cudnn_algo = (perf_results.get())[best_algo_idx].algo; #else PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionForwardAlgorithm( + common::dynload::cudnnGetConvolutionForwardAlgorithm( handle, x_desc, w_desc, @@ -485,20 +485,21 @@ void ConvFusionKernel(const Context& ctx, cudnn_algo)); #endif PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(handle, - x_desc, - w_desc, - cudnn_conv_desc, - o_desc, - *cudnn_algo, - wks_bytes)); + common::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, + x_desc, + w_desc, + cudnn_conv_desc, + o_desc, + *cudnn_algo, + wks_bytes)); } else { std::array fwd_perf_stat; int returned_algo_count; auto cudnn_find_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnFindConvolutionForwardAlgorithmEx( + common::dynload::cudnnFindConvolutionForwardAlgorithmEx( handle, x_desc, transformed_input.data(), @@ -517,7 +518,7 @@ void ConvFusionKernel(const Context& ctx, *cudnn_algo = fwd_perf_stat[0].algo; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionForwardWorkspaceSize( + common::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, x_desc, w_desc, @@ -561,22 +562,22 @@ void ConvFusionKernel(const Context& ctx, ScalingParamType alpha = 1.0f, beta = 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnConvolutionForward(handle, - &alpha, - x_desc, - transformed_input.data(), - w_desc, - filter.data(), - cudnn_conv_desc, - algo, - cudnn_workspace, - workspace_size, - &beta, - o_desc, - output->data())); + common::dynload::cudnnConvolutionForward(handle, + &alpha, + x_desc, + transformed_input.data(), + w_desc, + filter.data(), + cudnn_conv_desc, + algo, + cudnn_workspace, + workspace_size, + &beta, + o_desc, + output->data())); }; workspace_handle.RunFunc(cudnn_func, workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnAddTensor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnAddTensor( handle, &alpha, b_desc, bias.data(), &alpha, o_desc, output->data())); } else { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_​PRECOMP_GEMM algo is @@ -589,7 +590,7 @@ void ConvFusionKernel(const Context& ctx, ScalingParamType beta = residual.get_ptr() ? 1.0f : 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnConvolutionBiasActivationForward( + common::dynload::cudnnConvolutionBiasActivationForward( handle, &alpha, x_desc, diff --git a/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu b/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu index 3ae7f0682bc75b..cdca87eb14f8fc 100644 --- a/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/phi/kernels/distributed_fused_lamb_init_kernel.h" +#include "paddle/common/enforce.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/algorithm.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu index e795d37ea490e1..7437645f4f28c2 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_grad_kernel.cu @@ -123,9 +123,9 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx, cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + common::dynload::cudnnCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + common::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); if (epsilon1 <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " << "CUDNN_BN_MIN_EPSILON. Setting it to " @@ -133,13 +133,13 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx, } epsilon1 = std::max(epsilon1, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnDeriveBNTensorDescriptor( bn_param_desc_, data_desc_, mode_)); const auto *saved_mean_data = saved_mean.template data(); @@ -156,7 +156,7 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx, scope_act_desc.descriptor(act_type); // --------------- cudnn batchnorm workspace --------------- PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( + common::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, /*bnOps=*/bnOps_, @@ -173,7 +173,7 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx, (workspace_size + phi::SizeOf(x.dtype()) - 1) / phi::SizeOf(x.dtype()))}); workspace_ptr = dev_ctx.template Alloc(&workspace_tensor); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackwardEx( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, /*bnOps=*/bnOps_, @@ -210,9 +210,9 @@ void FusedBatchNormActGradKernel(const Context &dev_ctx, // clean when exit. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + common::dynload::cudnnDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + common::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); #else PADDLE_THROW(phi::errors::Unimplemented( "The fused_batch_norm_act operator is not supported on GPU " diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu index 700141f1e03318..904660c7f158b7 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bn_activation_kernel.cu @@ -113,22 +113,22 @@ void FusedBatchNormActKernel(const Context &dev_ctx, cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + common::dynload::cudnnCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + common::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); VLOG(3) << "Setting descriptors."; std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnDeriveBNTensorDescriptor( bn_param_desc_, data_desc_, mode_)); double this_factor = 1. - momentum; @@ -149,7 +149,7 @@ void FusedBatchNormActKernel(const Context &dev_ctx, // --------------- cudnn batchnorm workspace --------------- PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + common::dynload::cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, /*mode=*/mode_, /*bnOps=*/bnOps_, @@ -162,7 +162,7 @@ void FusedBatchNormActKernel(const Context &dev_ctx, // -------------- cudnn batchnorm reserve space -------------- PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + common::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, /*mode=*/mode_, /*bnOps=*/bnOps_, @@ -179,7 +179,7 @@ void FusedBatchNormActKernel(const Context &dev_ctx, workspace_ptr = dev_ctx.template Alloc(&workspace_tensor); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnBatchNormalizationForwardTrainingEx( + common::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, bnOps_, @@ -208,9 +208,9 @@ void FusedBatchNormActKernel(const Context &dev_ctx, // clean when exit. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + common::dynload::cudnnDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + common::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); #else PADDLE_THROW(phi::errors::Unimplemented( "The fused_batch_norm_act operator is not supported on GPU " diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu index 894903fb0fab83..850af21da4578a 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_grad_kernel.cu @@ -115,9 +115,9 @@ void FusedBatchNormAddActGradKernel(const Context &dev_ctx, cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + common::dynload::cudnnCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + common::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); if (epsilon1 <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " << "CUDNN_BN_MIN_EPSILON. Setting it to " @@ -125,13 +125,13 @@ void FusedBatchNormAddActGradKernel(const Context &dev_ctx, } epsilon1 = std::max(epsilon1, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnDeriveBNTensorDescriptor( bn_param_desc_, data_desc_, mode_)); const auto *saved_mean_ptr = &saved_mean; @@ -151,7 +151,7 @@ void FusedBatchNormAddActGradKernel(const Context &dev_ctx, scope_act_desc.descriptor(act_type); // --------------- cudnn batchnorm workspace --------------- PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( + common::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, /*bnOps=*/bnOps_, @@ -167,7 +167,7 @@ void FusedBatchNormAddActGradKernel(const Context &dev_ctx, workspace_tensor.Resize({static_cast(workspace_size)}); workspace_ptr = dev_ctx.template Alloc(&workspace_tensor); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackwardEx( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, /*bnOps=*/bnOps_, @@ -201,9 +201,9 @@ void FusedBatchNormAddActGradKernel(const Context &dev_ctx, // clean when exit. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + common::dynload::cudnnDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + common::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); #else PADDLE_THROW(phi::errors::Unimplemented( "The fused_bn_add_activation operator is not supported on GPU " diff --git a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu index 52152476e4aca1..96761aa3baac1f 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_bn_add_activation_kernel.cu @@ -100,20 +100,20 @@ void FusedBatchNormAddActKernel(const Context &dev_ctx, cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + common::dynload::cudnnCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + common::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnDeriveBNTensorDescriptor( bn_param_desc_, data_desc_, mode_)); double this_factor = 1. - momentum; @@ -136,7 +136,7 @@ void FusedBatchNormAddActKernel(const Context &dev_ctx, // --------------- cudnn batchnorm workspace --------------- PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + common::dynload::cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, /*mode=*/mode_, /*bnOps=*/bnOps_, @@ -149,7 +149,7 @@ void FusedBatchNormAddActKernel(const Context &dev_ctx, // -------------- cudnn batchnorm reserve space -------------- PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + common::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, /*mode=*/mode_, /*bnOps=*/bnOps_, @@ -168,7 +168,7 @@ void FusedBatchNormAddActKernel(const Context &dev_ctx, &workspace_tensor, workspace_tensor.numel() * sizeof(T)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnBatchNormalizationForwardTrainingEx( + common::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, bnOps_, @@ -203,9 +203,9 @@ void FusedBatchNormAddActKernel(const Context &dev_ctx, // clean when exit. 
    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+        common::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+        common::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
 #else
   PADDLE_THROW(phi::errors::Unimplemented(
       "The fused_bn_add_activation operator is not supported on GPU "
diff --git a/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu
index 2670c80c1aa07c..bb5008146e0d1c 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_embedding_eltwise_layernorm_kernel.cu
@@ -15,11 +15,11 @@
 #include
 #include
+#include "paddle/common/enforce.h"
+#include "paddle/common/errors.h"
 #include "paddle/common/float16.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/common/place.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/emb_eltwise_layer_norm_functor.h"
diff --git a/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu
index 6fd829aa757346..cbb2e1b8fb0321 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_fc_elementwise_layernorm_kernel.cu
@@ -27,11 +27,11 @@ namespace cub = hipcub;
 #include
 #endif
+#include "paddle/common/enforce.h"
+#include "paddle/common/errors.h"
 #include "paddle/common/float16.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu
index 70ea70912f6397..b1664e4b15d48e 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/amp_type_traits.h"
-#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h"
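Note on the fused batch-norm hunks above: the kernels describe a channel-last tensor to cuDNN by passing dims in NCHW[D] order together with explicit strides ({H * W * D * C, 1, W * D * C, D * C, C}). A minimal sketch of how such strides can be derived, not taken from the patch itself (the helper name and parameters are hypothetical):

    #include <vector>

    // Strides for dims given in {N, C, H, W, D} order when the data is
    // physically laid out channel-last (N, H, W, D, C): the stride of an
    // axis is the number of elements between consecutive indices on it.
    std::vector<int> ChannelLastStrides(int c, int h, int w, int d) {
      return {h * w * d * c,  // N: one full sample
              1,              // C: channels are innermost
              w * d * c,      // H
              d * c,          // W
              c};             // D
    }

This is why the descriptor calls above take both dims.data() and strides.data(): the layout is carried entirely by the stride vector.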
+#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/fusion/gpu/fused_rope_utils.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu index 6c7fe36d364576..a08af5a5b89581 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu index 30e5599aac2363..a0b7cf5b2689ce 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_kernel.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/common/errors.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/fusion/gpu/fusion_group_kernel.cu b/paddle/phi/kernels/fusion/gpu/fusion_group_kernel.cu index eee5a4b84b54a6..91f5347276f9d2 100644 --- a/paddle/phi/kernels/fusion/gpu/fusion_group_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fusion_group_kernel.cu @@ -14,10 +14,10 @@ #include "glog/logging.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/device_code.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu b/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu index 9921a2db5ad173..67856582b1dbd3 100644 --- a/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fusion_transpose_flatten_concat_kernel.cu @@ -15,10 +15,10 @@ #include #include +#include "paddle/common/enforce.h" +#include "paddle/common/errors.h" #include "paddle/common/float16.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/common_shape.h" @@ -51,9 +51,9 @@ void TransposeFlattenConcatFusionKernel( cudnnTensorDescriptor_t in_desc; cudnnTensorDescriptor_t out_desc; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&in_desc)); + common::dynload::cudnnCreateTensorDescriptor(&in_desc)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnCreateTensorDescriptor(&out_desc)); + common::dynload::cudnnCreateTensorDescriptor(&out_desc)); 
   cudnnDataType_t cudnn_dtype = CudnnDataType<T>::type;
   auto handle = dev_ctx.cudnn_handle();
@@ -91,12 +91,12 @@ void TransposeFlattenConcatFusionKernel(
       dims_y[i] = 1;
     }
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor(
         in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data()));
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor(
         out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data()));
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnTransformTensor(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnTransformTensor(
         handle,
         CudnnDataType<T>::kOne(),
         in_desc,
@@ -112,9 +112,9 @@ void TransposeFlattenConcatFusionKernel(
     }
   }
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnDestroyTensorDescriptor(in_desc));
+      common::dynload::cudnnDestroyTensorDescriptor(in_desc));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnDestroyTensorDescriptor(out_desc));
+      common::dynload::cudnnDestroyTensorDescriptor(out_desc));
 #else
   PADDLE_THROW(phi::errors::Unimplemented(
       "The fusion_transpose_flatten_concat operator is not supported on HIP."));
diff --git a/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu b/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu
index 87bb36f1162e1a..0efb0718a36fe0 100644
--- a/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/multihead_matmul_kernel.cu
@@ -15,9 +15,9 @@
 #include
 #include
+#include "paddle/common/enforce.h"
+#include "paddle/common/errors.h"
 #include "paddle/common/float16.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc b/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc
index e3fa939aad7537..51b1dac9f07618 100644
--- a/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc
+++ b/paddle/phi/kernels/fusion/onednn/fusion_gru_kernel.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "paddle/common/data_type.h"
+#include "paddle/common/enforce.h"
+#include "paddle/common/errors.h"
 #include "paddle/phi/backends/onednn/onednn_reuse.h"
 #include "paddle/phi/core/compat/convert_utils.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/expect.h"
-#include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
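The fusion_transpose_flatten_concat hunk above shows the pattern these kernels repeat: every cudnnCreateTensorDescriptor is paired with a matching cudnnDestroyTensorDescriptor at the end of the function. A minimal sketch, not part of the patch, of how an RAII guard could make that pairing automatic (the class name is hypothetical; production code would also check the returned status, as the kernels do via PADDLE_ENFORCE_GPU_SUCCESS):

    #include <cudnn.h>

    // Pairs create/destroy automatically so early returns or exceptions
    // cannot leak the descriptor.
    class TensorDescriptorGuard {
     public:
      TensorDescriptorGuard() { cudnnCreateTensorDescriptor(&desc_); }
      ~TensorDescriptorGuard() { cudnnDestroyTensorDescriptor(desc_); }
      TensorDescriptorGuard(const TensorDescriptorGuard&) = delete;
      TensorDescriptorGuard& operator=(const TensorDescriptorGuard&) = delete;
      cudnnTensorDescriptor_t get() const { return desc_; }

     private:
      cudnnTensorDescriptor_t desc_ = nullptr;
    };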
diff --git a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc
index 3448efca7c3ab1..29f74e8e1fe237 100644
--- a/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc
+++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "paddle/common/ddim.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/backends/xpu/xpu_context.h"
-#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc
index 221305014190bd..dab55c1bbc10ae 100644
--- a/paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc
+++ b/paddle/phi/kernels/fusion/xpu/fused_feedforward_kernel.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "paddle/common/ddim.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/backends/xpu/xpu_context.h"
-#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu
index 99ccfcd8667e6d..c021667bbec3fb 100644
--- a/paddle/phi/kernels/gpu/allclose_kernel.cu
+++ b/paddle/phi/kernels/gpu/allclose_kernel.cu
@@ -16,9 +16,9 @@
 #include "glog/logging.h"
+#include "paddle/common/enforce.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/data_type.h"
-#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu
index 8b59c79f0a4e08..c4b1f46f5ac525 100644
--- a/paddle/phi/kernels/gpu/arange_kernel.cu
+++ b/paddle/phi/kernels/gpu/arange_kernel.cu
@@ -15,11 +15,11 @@
 #include "paddle/phi/kernels/arange_kernel.h"
 #include "paddle/common/bfloat16.h"
+#include "paddle/common/enforce.h"
+#include "paddle/common/errors.h"
 #include "paddle/common/float16.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
-#include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/range_function.h"
diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
index caa635255b9878..191a62698c817f 100644
--- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
+++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
@@ -28,8 +28,8 @@ namespace cub = hipcub;
 #endif
 #include
-#include "paddle/phi/core/ddim.h"
-#include "paddle/phi/core/utils/data_type.h"
+#include "paddle/common/data_type.h"
+#include "paddle/common/ddim.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
index c3c353859728b7..53e151ed27faa5 100644
--- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
@@ -14,10 +14,10 @@
 #include "glog/logging.h"
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/common/layout.h"
-#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/flags.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
@@ -675,9 +675,9 @@ void BatchNormGradFunctor(const Context &ctx,
   cudnnBatchNormMode_t mode_;
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+      common::dynload::cudnnCreateTensorDescriptor(&data_desc_));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+      common::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
 #endif
   if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
     LOG(ERROR) << "Provided epsilon is smaller than "
@@ -714,13 +714,13 @@ void BatchNormGradFunctor(const Context &ctx,
   //     platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_,
   //     data_desc_, mode_));
 #else
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor(
       data_desc_,
       CudnnDataType<T>::type,
       x_dims.size() > 3 ? x_dims.size() : 4,
       dims.data(),
       strides.data()));
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnDeriveBNTensorDescriptor(
       bn_param_desc_, data_desc_, mode_));
 #endif
@@ -951,7 +951,7 @@ void BatchNormGradFunctor(const Context &ctx,
   auto reserve_space_size = reserve_space->memory_size();
   // --------------- cudnn batchnorm workspace ---------------
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
+      common::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
          /*handle=*/ctx.cudnn_handle(),
          /*mode=*/mode_,
          /*bnIps=*/CUDNN_BATCHNORM_OPS_BN,
@@ -969,7 +969,7 @@ void BatchNormGradFunctor(const Context &ctx,
       static_cast<void *>(ctx.template Alloc<uint8_t>(&workspace_tensor));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnBatchNormalizationBackwardEx(
+      common::dynload::cudnnBatchNormalizationBackwardEx(
          /*handle=*/ctx.cudnn_handle(),
          /*mode=*/mode_,
          /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
@@ -1006,7 +1006,7 @@ void BatchNormGradFunctor(const Context &ctx,
          /*reserveSpaceSizeInBytes=*/reserve_space_size));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnBatchNormalizationBackward(
+      common::dynload::cudnnBatchNormalizationBackward(
          ctx.cudnn_handle(),
          mode_,
          CudnnDataType<T>::kOne(),
@@ -1136,9 +1136,9 @@ void BatchNormGradFunctor(const Context &ctx,
 #else
     // clean when exit.
    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+        common::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+        common::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
 #endif
   } else {
diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
index 20aa02a5f24856..6609d555871473 100644
--- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
@@ -22,10 +22,10 @@ namespace cub = hipcub;
 #include "glog/logging.h"
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/common/layout.h"
-#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/flags.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/batch_norm_kernel.h"
@@ -618,9 +618,9 @@ void BatchNormKernel(const Context &ctx,
   cudnnBatchNormMode_t mode_;
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+      common::dynload::cudnnCreateTensorDescriptor(&data_desc_));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+      common::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
 #endif
   if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
@@ -671,14 +671,14 @@ void BatchNormKernel(const Context &ctx,
   //     platform::dynload::miopenDeriveBNTensorDescriptor(
   //     bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_));
 #else
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor(
       data_desc_,
       CudnnDataType<T>::type,
       x_dims.size() > 3 ? x_dims.size() : 4,
       dims.data(),
       strides.data()));
   // Note: PERSISTENT not implemented for inference
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnDeriveBNTensorDescriptor(
       bn_param_desc_, data_desc_, test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
 #endif
@@ -846,7 +846,7 @@ void BatchNormKernel(const Context &ctx,
     }
   } else {
     PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnBatchNormalizationForwardInference(
+        common::dynload::cudnnBatchNormalizationForwardInference(
            handle,
            // Note: PERSISTENT not implemented for inference
            CUDNN_BATCHNORM_SPATIAL,
@@ -1132,7 +1132,7 @@ void BatchNormKernel(const Context &ctx,
             "The argument ReserveSpace of batch_norm op is not found."));
     // --------------- cudnn batchnorm workspace ---------------
     PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::
+        common::dynload::
             cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
                /*handle=*/handle,
                /*mode=*/mode_,
@@ -1146,13 +1146,14 @@ void BatchNormKernel(const Context &ctx,
     // -------------- cudnn batchnorm reserve space --------------
     PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
-            /*handle=*/handle,
-            /*mode=*/mode_,
-            /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
-            /*activationDesc=*/nullptr,
-            /*xDesc=*/data_desc_,
-            /*sizeInBytes=*/&reserve_space_size));
+        common::dynload::
+            cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
+                /*handle=*/handle,
+                /*mode=*/mode_,
+                /*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
+                /*activationDesc=*/nullptr,
+                /*xDesc=*/data_desc_,
+                /*sizeInBytes=*/&reserve_space_size));
     reserve_space->Resize({static_cast<int64_t>(reserve_space_size)});
     reserve_space_ptr =
@@ -1161,7 +1162,7 @@ void BatchNormKernel(const Context &ctx,
     workspace_ptr =
         static_cast<void *>(ctx.template Alloc<uint8_t>(&workspace_tensor));
     PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnBatchNormalizationForwardTrainingEx(
+        common::dynload::cudnnBatchNormalizationForwardTrainingEx(
            handle,
            mode_,
            CUDNN_BATCHNORM_OPS_BN,
@@ -1189,7 +1190,7 @@ void BatchNormKernel(const Context &ctx,
             reserve_space_size));
 #else
     PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnBatchNormalizationForwardTraining(
+        common::dynload::cudnnBatchNormalizationForwardTraining(
            handle,
            mode_,
            CudnnDataType<T>::kOne(),
@@ -1228,9 +1229,9 @@ void BatchNormKernel(const Context &ctx,
 #else
     // clean when exit.
    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+        common::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+        common::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
 #endif
   }
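The training path in batch_norm_kernel.cu above follows cuDNN's two-phase convention: query the workspace and reserve-space sizes for the chosen configuration, allocate both buffers, then invoke the Ex entry point. A minimal sketch of that shape, under the assumption of a plain cuDNN program (the helper name and the descriptor parameters are hypothetical; allocation and error checking are elided):

    #include <cstddef>
    #include <cudnn.h>

    void QueryBatchNormTrainingSizes(cudnnHandle_t handle,
                                     cudnnBatchNormMode_t mode,
                                     cudnnTensorDescriptor_t x_desc,
                                     cudnnTensorDescriptor_t bn_desc,
                                     size_t* workspace_bytes,
                                     size_t* reserve_bytes) {
      // 1) Scratch memory needed by this configuration.
      cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
          handle, mode, CUDNN_BATCHNORM_OPS_BN,
          /*xDesc=*/x_desc, /*zDesc=*/nullptr, /*yDesc=*/x_desc,
          /*bnScaleBiasMeanVarDesc=*/bn_desc, /*activationDesc=*/nullptr,
          workspace_bytes);
      // 2) Reserve space: written by the forward pass and consumed by the
      //    backward pass, so it must outlive both.
      cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
          handle, mode, CUDNN_BATCHNORM_OPS_BN,
          /*activationDesc=*/nullptr, /*xDesc=*/x_desc, reserve_bytes);
      // 3) Allocate, then call cudnnBatchNormalizationForwardTrainingEx.
    }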
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
index 40ea1f195069e9..c77e9dbcbb1eb7 100644
--- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
@@ -16,9 +16,9 @@
 #include
+#include "paddle/common/enforce.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
diff --git a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
index add5b8b7448c9a..b8fe833c2cb431 100644
--- a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
+++ b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
@@ -17,8 +17,8 @@
 #include "paddle/common/backends/dynload/cusolver.h"
 #include "paddle/common/complex.h"
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
 #include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h"
diff --git a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
index c64facc1e6879b..88cce731ef2495 100644
--- a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
@@ -14,8 +14,8 @@
 #include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
+#include "paddle/common/ddim.h"
 #include "paddle/phi/common/layout.h"
-#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
index bee31450cbf70f..647f9316dc6caa 100644
--- a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
+++ b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
@@ -14,8 +14,8 @@
 #include "paddle/phi/kernels/conv_transpose_kernel.h"
+#include "paddle/common/ddim.h"
 #include "paddle/phi/common/layout.h"
-#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cpu/conv_util.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
index 63e52527cb9cdd..53502c096e7286 100644
--- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
@@ -764,7 +764,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
 #ifdef PADDLE_WITH_HIP
   auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
                                : MIOPEN_SOFTMAX_MODE_CHANNEL;
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSoftmaxForward_V2(
       handle,
       phi::backends::gpu::CudnnDataType<T>::kOne(),
       descp,
@@ -777,7 +777,7 @@ static void SoftmaxWithCrossEntropySoftLabel(const GPUContext& dev_ctx,
 #else
   auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
                                : CUDNN_SOFTMAX_MODE_CHANNEL;
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSoftmaxForward(
       handle,
       CUDNN_SOFTMAX_LOG,
       mode,
@@ -1196,7 +1196,7 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx,
 #ifdef PADDLE_WITH_HIP
   auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE
                                : MIOPEN_SOFTMAX_MODE_CHANNEL;
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSoftmaxForward_V2(
       handle,
       phi::backends::gpu::CudnnDataType<T>::kOne(),
       descp,
@@ -1209,7 +1209,7 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx,
 #else
   auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE
                                : CUDNN_SOFTMAX_MODE_CHANNEL;
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSoftmaxForward(
       handle,
       CUDNN_SOFTMAX_LOG,
       mode,
diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_cache.h b/paddle/phi/kernels/gpu/cudnn_lstm_cache.h
index 88b20c4dd80977..599878372b6cc8 100644
--- a/paddle/phi/kernels/gpu/cudnn_lstm_cache.h
+++ b/paddle/phi/kernels/gpu/cudnn_lstm_cache.h
@@ -92,7 +92,7 @@ class ScopedRNNBase {
     size_t state_size;
     if (!initialized_) {
       PADDLE_ENFORCE_GPU_SUCCESS(
-          phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
+          common::dynload::cudnnDropoutGetStatesSize(handle, &state_size));
       phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
       auto* dev_ctx = reinterpret_cast(pool.Get(place));
       dropout_state->Resize({static_cast<int64_t>(state_size)});
@@ -107,7 +107,7 @@ class ScopedRNNBase {
                                  state_size);
     // ------------------- cudnn rnn descriptors ---------------------
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNDescriptor_v6(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetRNNDescriptor_v6(
         handle,
         rnn_desc_.desc(),
        hidden_size_,
@@ -121,14 +121,14 @@ class ScopedRNNBase {
 #if CUDNN_VERSION >= 7201
     if (!sequence_length.empty()) {
-      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNPaddingMode(
+      PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetRNNPaddingMode(
           rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED));
     }
 #endif
     // ------------------- cudnn weights_size ---------------------
     size_t weights_size_;
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnGetRNNParamsSize(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnGetRNNParamsSize(
         handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type));
     PADDLE_ENFORCE_EQ(
         weights_size_,
@@ -143,12 +143,12 @@ class ScopedRNNBase {
     weight_desc_.descriptor(layout, dim_w);
     // ------------------- cudnn workspace, reserve size ---------------------
     PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnGetRNNWorkspaceSize(handle,
-                                               rnn_desc_.desc(),
-                                               seq_length_,
-                                               x_descs_.data(),
-                                               workspace_size));
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnGetRNNTrainingReserveSize(
+        common::dynload::cudnnGetRNNWorkspaceSize(handle,
+                                                  rnn_desc_.desc(),
+                                                  seq_length_,
+                                                  x_descs_.data(),
+                                                  workspace_size));
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnGetRNNTrainingReserveSize(
         handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size));
   }
   cudnnTensorDescriptor_t* x_descs() { return x_descs_.data(); }
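ScopedRNNBase above also shows a caching detail worth noting: the dropout state buffer is sized and filled only on the first call (guarded by initialized_), and later invocations reuse it. A minimal host-side sketch of that lazy one-time initialization, with hypothetical names and a plain std::vector standing in for the device tensor:

    #include <cstddef>
    #include <vector>

    class DropoutStateCache {
     public:
      // Sizes the state buffer on first use only; later calls reuse it,
      // which keeps the RNG state stable across kernel invocations.
      std::vector<unsigned char>& EnsureState(std::size_t required_bytes) {
        if (!initialized_) {
          state_.resize(required_bytes);
          initialized_ = true;
        }
        return state_;
      }

     private:
      bool initialized_ = false;
      std::vector<unsigned char> state_;
    };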
diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu
index 661a1dd90e7e9b..ccfadb99b4fefc 100644
--- a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu
@@ -148,36 +148,36 @@ void CudnnLSTMGradKernel(
   if (!has_seq_length) {
     // This interface is used when the input/output is unpadded.
 #ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::miopenRNNBackwardData(handle,
-            rnn.rnn_desc(),
-            seq_length,
-            rnn.y_descs(),
-            out_data,
-            rnn.y_descs(),
-            out_grad_data,
-            rnn.last_h_desc(),
-            last_h_grad_data,
-            rnn.last_c_desc(),
-            last_c_grad_data,
-            rnn.weight_desc(),
-            weight_data,
-            rnn.init_h_desc(),
-            init_h_data,
-            rnn.init_c_desc(),
-            init_c_data,
-            rnn.x_descs(),
-            in_grad_data,
-            rnn.init_h_desc(),
-            init_h_grad_data,
-            rnn.init_c_desc(),
-            init_c_grad_data,
-            workspace_data_.data(),
-            workspace_size,
-            const_cast<uint8_t *>(reserve_data),
-            reserve_size));
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenRNNBackwardData(
+        handle,
+        rnn.rnn_desc(),
+        seq_length,
+        rnn.y_descs(),
+        out_data,
+        rnn.y_descs(),
+        out_grad_data,
+        rnn.last_h_desc(),
+        last_h_grad_data,
+        rnn.last_c_desc(),
+        last_c_grad_data,
+        rnn.weight_desc(),
+        weight_data,
+        rnn.init_h_desc(),
+        init_h_data,
+        rnn.init_c_desc(),
+        init_c_data,
+        rnn.x_descs(),
+        in_grad_data,
+        rnn.init_h_desc(),
+        init_h_grad_data,
+        rnn.init_c_desc(),
+        init_c_grad_data,
+        workspace_data_.data(),
+        workspace_size,
+        const_cast<uint8_t *>(reserve_data),
+        reserve_size));
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardWeights(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenRNNBackwardWeights(
        handle,
        rnn.rnn_desc(),
        seq_length,
@@ -194,36 +194,36 @@ void CudnnLSTMGradKernel(
        const_cast<uint8_t *>(reserve_data),
        reserve_size));
 #else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnRNNBackwardData(handle,
-            rnn.rnn_desc(),
-            seq_length,
-            rnn.y_descs(),
-            out_data,
-            rnn.y_descs(),
-            out_grad_data,
-            rnn.last_h_desc(),
-            last_h_grad_data,
-            rnn.last_c_desc(),
-            last_c_grad_data,
-            rnn.weight_desc(),
-            weight_data,
-            rnn.init_h_desc(),
-            init_h_data,
-            rnn.init_c_desc(),
-            init_c_data,
-            rnn.x_descs(),
-            in_grad_data,
-            rnn.init_h_desc(),
-            init_h_grad_data,
-            rnn.init_c_desc(),
-            init_c_grad_data,
-            workspace_data_.data(),
-            workspace_size,
-            const_cast<uint8_t *>(reserve_data),
-            reserve_size));
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNBackwardData(
+        handle,
+        rnn.rnn_desc(),
+        seq_length,
+        rnn.y_descs(),
+        out_data,
+        rnn.y_descs(),
+        out_grad_data,
+        rnn.last_h_desc(),
+        last_h_grad_data,
+        rnn.last_c_desc(),
+        last_c_grad_data,
+        rnn.weight_desc(),
+        weight_data,
+        rnn.init_h_desc(),
+        init_h_data,
+        rnn.init_c_desc(),
+        init_c_data,
+        rnn.x_descs(),
+        in_grad_data,
+        rnn.init_h_desc(),
+        init_h_grad_data,
+        rnn.init_c_desc(),
+        init_c_grad_data,
+        workspace_data_.data(),
+        workspace_size,
+        const_cast<uint8_t *>(reserve_data),
+        reserve_size));
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNBackwardWeights(
        handle,
        rnn.rnn_desc(),
        seq_length,
@@ -244,7 +244,7 @@ void CudnnLSTMGradKernel(
 #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201
     // for train
     // This interface is used when the input/output is padded.
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardDataEx(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNBackwardDataEx(
        handle,
        rnn.rnn_desc(),
        rnn.y_seq_desc(),
@@ -276,7 +276,7 @@ void CudnnLSTMGradKernel(
        const_cast(reserve_data),
        reserve_size));
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeightsEx(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNBackwardWeightsEx(
        handle,
        rnn.rnn_desc(),
        rnn.x_seq_desc(),
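The LSTM kernels above pick between two families of cuDNN entry points: the classic calls when every sample in the batch has the full sequence length, and the *Ex variants (available since cuDNN 7.2) when variable-length, padded IO is in play. A minimal sketch of that dispatch decision, with hypothetical names:

    #include <vector>

    enum class RnnIoPath { kUnpadded, kPaddedEx };

    // An empty length vector means all samples use the full sequence length,
    // so the plain (unpadded) interface suffices; otherwise the padded *Ex
    // interface is required and CUDNN_RNN_PADDED_IO_ENABLED must be set on
    // the RNN descriptor, as the cache class above does.
    RnnIoPath ChooseRnnPath(const std::vector<int>& sequence_lengths) {
      return sequence_lengths.empty() ? RnnIoPath::kUnpadded
                                      : RnnIoPath::kPaddedEx;
    }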
diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu
index f3a03727e0bc45..121f15ec87b7e2 100644
--- a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu
+++ b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu
@@ -44,53 +44,53 @@ void LSTMInferece(const bool &has_seq_length,
     // for inference
     // This interface is used when the input/output is unpadded.
 #ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::miopenRNNForwardInference(handle,
-            rnn->rnn_desc(),
-            seq_length,
-            rnn->x_descs(),
-            x_data,
-            rnn->init_h_desc(),
-            init_h_data,
-            rnn->init_c_desc(),
-            init_c_data,
-            rnn->weight_desc(),
-            w_data,
-            rnn->y_descs(),
-            out_data,
-            rnn->last_h_desc(),
-            last_h_data,
-            rnn->last_c_desc(),
-            last_c_data,
-            workspace_data->data(),
-            workspace_size));
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenRNNForwardInference(
+        handle,
+        rnn->rnn_desc(),
+        seq_length,
+        rnn->x_descs(),
+        x_data,
+        rnn->init_h_desc(),
+        init_h_data,
+        rnn->init_c_desc(),
+        init_c_data,
+        rnn->weight_desc(),
+        w_data,
+        rnn->y_descs(),
+        out_data,
+        rnn->last_h_desc(),
+        last_h_data,
+        rnn->last_c_desc(),
+        last_c_data,
+        workspace_data->data(),
+        workspace_size));
 #else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnRNNForwardInference(handle,
-            rnn->rnn_desc(),
-            seq_length,
-            rnn->x_descs(),
-            x_data,
-            rnn->init_h_desc(),
-            init_h_data,
-            rnn->init_c_desc(),
-            init_c_data,
-            rnn->weight_desc(),
-            w_data,
-            rnn->y_descs(),
-            out_data,
-            rnn->last_h_desc(),
-            last_h_data,
-            rnn->last_c_desc(),
-            last_c_data,
-            workspace_data->data(),
-            workspace_size));
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNForwardInference(
+        handle,
+        rnn->rnn_desc(),
+        seq_length,
+        rnn->x_descs(),
+        x_data,
+        rnn->init_h_desc(),
+        init_h_data,
+        rnn->init_c_desc(),
+        init_c_data,
+        rnn->weight_desc(),
+        w_data,
+        rnn->y_descs(),
+        out_data,
+        rnn->last_h_desc(),
+        last_h_data,
+        rnn->last_c_desc(),
+        last_c_data,
+        workspace_data->data(),
+        workspace_size));
 #endif
   } else {
 #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201
     // for inference
     // This interface is used when the input/output is padded.
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardInferenceEx(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNForwardInferenceEx(
        handle,
        rnn->rnn_desc(),
        rnn->x_seq_desc(),
@@ -269,7 +269,7 @@ void CudnnLSTMKernel(
   // for train
   // This interface is used when the input/output is unpadded.
 #ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNForwardTraining(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenRNNForwardTraining(
        handle,
        rnn.rnn_desc(),
        seq_length,
@@ -292,34 +292,34 @@ void CudnnLSTMKernel(
        reserve_data,
        reserve_size));
 #else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::cudnnRNNForwardTraining(handle,
-            rnn.rnn_desc(),
-            seq_length,
-            rnn.x_descs(),
-            x_data,
-            rnn.init_h_desc(),
-            init_h_data,
-            rnn.init_c_desc(),
-            init_c_data,
-            rnn.weight_desc(),
-            w_data,
-            rnn.y_descs(),
-            out_data,
-            rnn.last_h_desc(),
-            last_h_data,
-            rnn.last_c_desc(),
-            last_c_data,
-            workspace_data_.data(),
-            workspace_size,
-            reserve_data,
-            reserve_size));
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNForwardTraining(
+        handle,
+        rnn.rnn_desc(),
+        seq_length,
+        rnn.x_descs(),
+        x_data,
+        rnn.init_h_desc(),
+        init_h_data,
+        rnn.init_c_desc(),
+        init_c_data,
+        rnn.weight_desc(),
+        w_data,
+        rnn.y_descs(),
+        out_data,
+        rnn.last_h_desc(),
+        last_h_data,
+        rnn.last_c_desc(),
+        last_c_data,
+        workspace_data_.data(),
+        workspace_size,
+        reserve_data,
+        reserve_size));
 #endif
   } else {
 #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201
     // for train
     // This interface is used when the input/output is padded.
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardTrainingEx(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNForwardTrainingEx(
        handle,
        rnn.rnn_desc(),
        rnn.x_seq_desc(),
diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
index fdd9b4ba499146..ad1df809564d52 100644
--- a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
@@ -24,7 +24,7 @@
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/inclusive_scan.h"
 // NOTE(@xiongkun): use of IsComplex<>
-#include "paddle/phi/core/utils/data_type.h"
+#include "paddle/common/data_type.h"
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/decode_jpeg_kernel.cu b/paddle/phi/kernels/gpu/decode_jpeg_kernel.cu
index 0b5a10b93d85a1..77fb6b78a25737 100644
--- a/paddle/phi/kernels/gpu/decode_jpeg_kernel.cu
+++ b/paddle/phi/kernels/gpu/decode_jpeg_kernel.cu
@@ -41,7 +41,7 @@ void DecodeJpegKernel(const Context& dev_ctx,
   // Create nvJPEG handle
   if (nvjpeg_handle == nullptr) {
     nvjpegStatus_t create_status =
-        phi::dynload::nvjpegCreateSimple(&nvjpeg_handle);
+        common::dynload::nvjpegCreateSimple(&nvjpeg_handle);
     PADDLE_ENFORCE_EQ(
         create_status,
@@ -51,7 +51,7 @@ void DecodeJpegKernel(const Context& dev_ctx,
   nvjpegJpegState_t nvjpeg_state;
   nvjpegStatus_t state_status =
-      phi::dynload::nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_state);
+      common::dynload::nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_state);
   PADDLE_ENFORCE_EQ(
       state_status,
@@ -66,13 +66,13 @@ void DecodeJpegKernel(const Context& dev_ctx,
   auto* x_data = x.data();
   nvjpegStatus_t info_status =
-      phi::dynload::nvjpegGetImageInfo(nvjpeg_handle,
-                                       x_data,
-                                       (std::size_t)x.numel(),
-                                       &components,
-                                       &subsampling,
-                                       widths,
-                                       heights);
+      common::dynload::nvjpegGetImageInfo(nvjpeg_handle,
+                                          x_data,
+                                          (std::size_t)x.numel(),
+                                          &components,
+                                          &subsampling,
+                                          widths,
+                                          heights);
   PADDLE_ENFORCE_EQ(info_status,
                     NVJPEG_STATUS_SUCCESS,
                     errors::Fatal("nvjpegGetImageInfo failed: ", info_status));
@@ -91,7 +91,7 @@ void DecodeJpegKernel(const Context& dev_ctx,
       output_format = NVJPEG_OUTPUT_RGB;
       output_components = 3;
     } else {
-      phi::dynload::nvjpegJpegStateDestroy(nvjpeg_state);
+      common::dynload::nvjpegJpegStateDestroy(nvjpeg_state);
       PADDLE_THROW(errors::Fatal(
           "The provided mode is not supported for JPEG files on GPU"));
     }
@@ -102,7 +102,7 @@ void DecodeJpegKernel(const Context& dev_ctx,
       output_format = NVJPEG_OUTPUT_RGB;
       output_components = 3;
     } else {
-      phi::dynload::nvjpegJpegStateDestroy(nvjpeg_state);
+      common::dynload::nvjpegJpegStateDestroy(nvjpeg_state);
       PADDLE_THROW(errors::Fatal(
           "The provided mode is not supported for JPEG files on GPU"));
     }
@@ -127,13 +127,13 @@ void DecodeJpegKernel(const Context& dev_ctx,
     out_image.pitch[c] = width;
   }
-  nvjpegStatus_t decode_status = phi::dynload::nvjpegDecode(nvjpeg_handle,
-                                                            nvjpeg_state,
-                                                            x_data,
-                                                            x.numel(),
-                                                            output_format,
-                                                            &out_image,
-                                                            nvjpeg_stream);
+  nvjpegStatus_t decode_status = common::dynload::nvjpegDecode(nvjpeg_handle,
+                                                               nvjpeg_state,
+                                                               x_data,
+                                                               x.numel(),
+                                                               output_format,
+                                                               &out_image,
+                                                               nvjpeg_stream);
 }
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
index bfe924bf3c56c2..5b961210281c8b 100644
--- a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
@@ -98,7 +98,7 @@ void FlashAttnUnpaddedGradKernel(const Context& ctx,
   VLOG(10) << "FlashAttn bwd seed: " << params.seed
            << ", offset: " << params.offset;
-  bool succ = phi::dynload::flash_attn_varlen_bwd(
+  bool succ = common::dynload::flash_attn_varlen_bwd(
       dout.data(),
       q.data(),
      k.data(),
@@ -207,7 +207,7 @@ void FlashAttnGradKernel(const Context& ctx,
   int num_splits = get_num_split();
-  bool succ = phi::dynload::flash_attn_bwd(
+  bool succ = common::dynload::flash_attn_bwd(
       dout.data(),
       q.data(),
      k.data(),
diff --git a/paddle/phi/kernels/gpu/flash_attn_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_kernel.cu
index aadae0f29c3427..5b2f9f711c4bbf 100644
--- a/paddle/phi/kernels/gpu/flash_attn_kernel.cu
+++ b/paddle/phi/kernels/gpu/flash_attn_kernel.cu
@@ -96,7 +96,7 @@ void FlashAttnUnpaddedKernel(
   VLOG(10) << "FlashAttn fwd seed: " << params.seed
            << ", offset: " << params.offset;
-  bool succ = phi::dynload::flash_attn_varlen_fwd(
+  bool succ = common::dynload::flash_attn_varlen_fwd(
       q.data(),
       k.data(),
      v.data(),
@@ -200,7 +200,7 @@ void FlashAttnKernel(const Context& ctx,
   cudaStream_t stream = ctx.stream();
-  bool succ = phi::dynload::flash_attn_fwd(
+  bool succ = common::dynload::flash_attn_fwd(
       q.data(),
       k.data(),
      v.data(),
diff --git a/paddle/phi/kernels/gpu/flash_attn_utils.h b/paddle/phi/kernels/gpu/flash_attn_utils.h
index ea438014f43125..cad32b299f037a 100644
--- a/paddle/phi/kernels/gpu/flash_attn_utils.h
+++ b/paddle/phi/kernels/gpu/flash_attn_utils.h
@@ -14,9 +14,9 @@
 #pragma once
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
-#include "paddle/phi/core/enforce.h"
 #ifdef PADDLE_WITH_FLASHATTN
 #include "paddle/phi/backends/dynload/flashattn.h"
@@ -264,7 +264,7 @@ static void CheckFlashAttnStatus(const bool status) {
       true,
       phi::errors::External(
           "Error in Flash-Attention, detail information is: %s",
-          phi::dynload::flash_attn_error()));
+          common::dynload::flash_attn_error()));
 }
 template
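CheckFlashAttnStatus above illustrates a common FFI pattern: the library reports failure through a bool return and keeps the explanation in a separate error-string accessor. A minimal standalone sketch of the same shape, with a stubbed-in accessor (FlashAttnError here is a hypothetical stand-in, not the real symbol):

    #include <stdexcept>
    #include <string>

    // Stand-in for the library's last-error accessor; the real one returns
    // a thread-local message describing the most recent failure.
    inline const char* FlashAttnError() { return "unknown error"; }

    void CheckStatus(bool ok) {
      if (!ok) {
        // Fetch the detail string at the failure site, since later calls
        // into the library may overwrite it.
        throw std::runtime_error(std::string("Error in Flash-Attention: ") +
                                 FlashAttnError());
      }
    }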
"paddle/common/enforce.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu index c0d5b737c5fbbf..401ac7cac670c6 100644 --- a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu @@ -14,11 +14,11 @@ #include "paddle/phi/kernels/index_add_grad_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/index_select_impl.h" diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index 8fd15d5435f98b..3f72349a727f2e 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -15,11 +15,11 @@ #include "paddle/phi/kernels/index_add_kernel.h" #include "glog/logging.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/utils/flags.h" PD_DECLARE_bool(cudnn_deterministic); diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu index ff23dc0c394b94..c0ca88b59d10e0 100644 --- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -17,11 +17,11 @@ #include #include +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu index c60abdb3372cb1..669a107e2208f3 100644 --- a/paddle/phi/kernels/gpu/index_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu @@ -17,10 +17,10 @@ #include #include +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index 6d0ba9e5bd4ef9..1519119a2b8807 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -15,11 +15,11 @@ #include "paddle/phi/kernels/index_select_grad_kernel.h" #include "glog/logging.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/utils/flags.h" diff --git 
diff --git a/paddle/phi/kernels/gpu/index_select_impl.h b/paddle/phi/kernels/gpu/index_select_impl.h
index deeb6e5eb20f27..f010de7a807102 100644
--- a/paddle/phi/kernels/gpu/index_select_impl.h
+++ b/paddle/phi/kernels/gpu/index_select_impl.h
@@ -14,11 +14,11 @@
 #pragma once
+#include "paddle/common/data_type.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/core/utils/data_type.h"
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu
index 910015e00f0103..6678c14022d802 100644
--- a/paddle/phi/kernels/gpu/index_select_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_select_kernel.cu
@@ -14,11 +14,11 @@
 #include "paddle/phi/kernels/index_select_kernel.h"
+#include "paddle/common/data_type.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/gpu/index_select_impl.h"
 namespace phi {
diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu
index 0f17a1bcc318a7..57b5e1cdee2820 100644
--- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu
@@ -398,17 +398,17 @@ void InstanceNormGradKernel(const Context &dev_ctx,
   miopenTensorDescriptor_t in_param_desc_;
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
+      common::dynload::miopenCreateTensorDescriptor(&data_desc_));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
+      common::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
 #else
   cudnnTensorDescriptor_t data_desc_;
   cudnnTensorDescriptor_t in_param_desc_;
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+      common::dynload::cudnnCreateTensorDescriptor(&data_desc_));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
+      common::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
 #endif
   if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
@@ -419,22 +419,22 @@ void InstanceNormGradKernel(const Context &dev_ctx,
   epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
 #ifdef PADDLE_WITH_HIP
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetTensorDescriptor(
       data_desc_,
       CudnnDataType<T>::type,
       x_dims.size() > 3 ? x_dims.size() : 4,
       const_cast<int *>(dims.data()),
       const_cast<int *>(strides.data())));
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenDeriveBNTensorDescriptor(
       in_param_desc_, data_desc_, miopenBNSpatial));
 #else
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor(
       data_desc_,
       CudnnDataType<T>::type,
       x_dims.size() > 3 ? x_dims.size() : 4,
       dims.data(),
       strides.data()));
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnDeriveBNTensorDescriptor(
       in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
 #endif
   const auto *saved_mean_data =
@@ -444,28 +444,29 @@ void InstanceNormGradKernel(const Context &dev_ctx,
   if (d_scale && d_bias) {
 #ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward(
-        dev_ctx.cudnn_handle(),
-        miopenBNSpatial,
-        CudnnDataType<T>::kOne(),
-        CudnnDataType<T>::kZero(),
-        CudnnDataType<T>::kOne(),
-        CudnnDataType<T>::kZero(),
-        data_desc_,
-        x_tmp.template data<T>(),
-        data_desc_,
-        d_y_tmp.template data<T>(),
-        data_desc_,
-        d_x->template data<T>(),
-        in_param_desc_,
-        scale_tmp.template data<BatchNormParamType<T>>(),
-        d_scale_tmp.template data<BatchNormParamType<T>>(),
-        d_bias_tmp.template data<BatchNormParamType<T>>(),
-        epsilon,
-        saved_mean_data,
-        saved_var_data));
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        common::dynload::miopenBatchNormalizationBackward(
+            dev_ctx.cudnn_handle(),
+            miopenBNSpatial,
+            CudnnDataType<T>::kOne(),
+            CudnnDataType<T>::kZero(),
+            CudnnDataType<T>::kOne(),
+            CudnnDataType<T>::kZero(),
+            data_desc_,
+            x_tmp.template data<T>(),
+            data_desc_,
+            d_y_tmp.template data<T>(),
+            data_desc_,
+            d_x->template data<T>(),
+            in_param_desc_,
+            scale_tmp.template data<BatchNormParamType<T>>(),
+            d_scale_tmp.template data<BatchNormParamType<T>>(),
+            d_bias_tmp.template data<BatchNormParamType<T>>(),
+            epsilon,
+            saved_mean_data,
+            saved_var_data));
 #else
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnBatchNormalizationBackward(
        dev_ctx.cudnn_handle(),
        CUDNN_BATCHNORM_SPATIAL,
        CudnnDataType<T>::kOne(),
@@ -508,14 +509,14 @@ void InstanceNormGradKernel(const Context &dev_ctx,
 #ifdef PADDLE_WITH_HIP
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
+      common::dynload::miopenDestroyTensorDescriptor(data_desc_));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
+      common::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+      common::dynload::cudnnDestroyTensorDescriptor(data_desc_));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
+      common::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
 #endif
 }
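Both instance-norm paths above clamp the user-supplied epsilon to the library floor before building descriptors, logging rather than failing the launch. A minimal standalone sketch of that clamp (the helper name is hypothetical; CUDNN_BN_MIN_EPSILON and FLT_EPSILON come from cudnn.h and cfloat):

    #include <algorithm>
    #include <cfloat>
    #include <cudnn.h>

    double ClampBatchNormEpsilon(double epsilon) {
      if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
        // The kernels above emit LOG(ERROR) here before adjusting, since
        // cuDNN/MIOpen reject epsilons below their documented floor.
      }
      return std::max(epsilon, static_cast<double>(CUDNN_BN_MIN_EPSILON));
    }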
diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu
index 7f10eac67c67c8..4d4a0865c88a02 100644
--- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu
@@ -66,17 +66,17 @@ void InstanceNormKernel(const Context &dev_ctx,
   miopenTensorDescriptor_t in_param_desc_;
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
+      common::dynload::miopenCreateTensorDescriptor(&data_desc_));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
+      common::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
 #else
   cudnnTensorDescriptor_t data_desc_;
   cudnnTensorDescriptor_t in_param_desc_;
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+      common::dynload::cudnnCreateTensorDescriptor(&data_desc_));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
+      common::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
 #endif
   if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
     LOG(ERROR) << "Provided epsilon is smaller than "
@@ -92,22 +92,22 @@ void InstanceNormKernel(const Context &dev_ctx,
   strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
 #ifdef PADDLE_WITH_HIP
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetTensorDescriptor(
       data_desc_,
       CudnnDataType<T>::type,
       x_dims.size() > 3 ? x_dims.size() : 4,
       const_cast<int *>(dims.data()),
       const_cast<int *>(strides.data())));
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenDeriveBNTensorDescriptor(
       in_param_desc_, data_desc_, miopenBNSpatial));
 #else
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetTensorNdDescriptor(
       data_desc_,
       CudnnDataType<T>::type,
       x_dims.size() > 3 ? x_dims.size() : 4,
       dims.data(),
       strides.data()));
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnDeriveBNTensorDescriptor(
       in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
 #endif
@@ -169,7 +169,7 @@ void InstanceNormKernel(const Context &dev_ctx,
 #ifdef PADDLE_WITH_HIP
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::miopenBatchNormalizationForwardTraining(
+      common::dynload::miopenBatchNormalizationForwardTraining(
          handle,
          miopenBNSpatial,
          const_cast(
@@ -193,12 +193,12 @@ void InstanceNormKernel(const Context &dev_ctx,
          static_cast(saved_variance_data)));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
+      common::dynload::miopenDestroyTensorDescriptor(data_desc_));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
+      common::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnBatchNormalizationForwardTraining(
+      common::dynload::cudnnBatchNormalizationForwardTraining(
          handle,
          CUDNN_BATCHNORM_SPATIAL,
          CudnnDataType<T>::kOne(),
@@ -218,9 +218,9 @@ void InstanceNormKernel(const Context &dev_ctx,
          saved_variance_data));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+      common::dynload::cudnnDestroyTensorDescriptor(data_desc_));
   PADDLE_ENFORCE_GPU_SUCCESS(
-      phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
+      common::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
 #endif
 }
diff --git a/paddle/phi/kernels/gpu/miopen_lstm_cache.h b/paddle/phi/kernels/gpu/miopen_lstm_cache.h
index 63604b4bd01e7e..a74604a0281087 100644
--- a/paddle/phi/kernels/gpu/miopen_lstm_cache.h
+++ b/paddle/phi/kernels/gpu/miopen_lstm_cache.h
@@ -80,7 +80,7 @@ class ScopedRNNBase {
     size_t state_size;
     if (!initialized_) {
       PADDLE_ENFORCE_GPU_SUCCESS(
-          phi::dynload::miopenDropoutGetStatesSize(handle, &state_size));
+          common::dynload::miopenDropoutGetStatesSize(handle, &state_size));
       phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
       auto* dev_ctx = reinterpret_cast(pool.Get(place));
       dropout_state->Resize({static_cast<int64_t>(state_size)});
@@ -95,7 +95,7 @@ class ScopedRNNBase {
                                  state_size);
     // ------------------- miopen rnn descriptors ---------------------
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetRNNDescriptor_V2(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetRNNDescriptor_V2(
         rnn_desc_.desc(),
         hidden_size_,
        num_layers_,
@@ -109,7 +109,7 @@ class ScopedRNNBase {
     // ------------------- miopen weights_size ---------------------
     size_t weights_size_;
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNParamsSize(
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenGetRNNParamsSize(
         handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, miopen_type));
     PADDLE_ENFORCE_EQ(
         weights_size_,
@@ -124,12 +124,12 @@ class ScopedRNNBase {
     weight_desc_.descriptor(layout, dim_w);
     // ------------------- miopen workspace, reserve size ---------------------
     PADDLE_ENFORCE_GPU_SUCCESS(
-        phi::dynload::miopenGetRNNWorkspaceSize(handle,
-                                                rnn_desc_.desc(),
-                                                seq_length_,
-                                                x_descs_.data(),
-                                                workspace_size));
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNTrainingReserveSize(
+        common::dynload::miopenGetRNNWorkspaceSize(handle,
+                                                   rnn_desc_.desc(),
+                                                   seq_length_,
+                                                   x_descs_.data(),
+                                                   workspace_size));
+    PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenGetRNNTrainingReserveSize(
         handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size));
   }
   miopenTensorDescriptor_t* x_descs() { return x_descs_.data(); }
diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu
index d23965811f9eb0..45c80a892e6c46 100644
--- a/paddle/phi/kernels/gpu/multinomial_kernel.cu
+++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu
@@ -22,10 +22,10 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
+#include "paddle/common/ddim.h"
 #include "paddle/common/scalar.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/data_type.h"
-#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/arg_min_max_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
diff --git a/paddle/phi/kernels/gpu/nonzero_kernel.cu b/paddle/phi/kernels/gpu/nonzero_kernel.cu
index bc44f4f033c458..65cdcd3d6a058d 100644
--- a/paddle/phi/kernels/gpu/nonzero_kernel.cu
+++ b/paddle/phi/kernels/gpu/nonzero_kernel.cu
@@ -20,7 +20,7 @@
 namespace cub = hipcub;
 #endif
-#include "paddle/phi/core/ddim.h"
+#include "paddle/common/ddim.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/phi/kernels/gpu/p_recv_kernel.cu b/paddle/phi/kernels/gpu/p_recv_kernel.cu
index 1e413797b6b893..25d6a677a2e7f6 100644
--- a/paddle/phi/kernels/gpu/p_recv_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_recv_kernel.cu
@@ -16,9 +16,9 @@
 #include "glog/logging.h"
+#include "paddle/common/ddim.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/memory_utils.h"
-#include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/kernel_registry.h"
 #if defined(PADDLE_WITH_NCCL) || \
diff --git a/paddle/phi/kernels/gpu/p_send_kernel.cu b/paddle/phi/kernels/gpu/p_send_kernel.cu
index 520adcf730a1d6..3c2af9103114b4 100644
--- a/paddle/phi/kernels/gpu/p_send_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_send_kernel.cu
@@ -16,10 +16,10 @@
 #include "glog/logging.h"
+#include "paddle/common/data_type.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/core/utils/data_type.h"
 #if defined(PADDLE_WITH_NCCL) || \
     defined(PADDLE_WITH_RCCL) && NCCL_VERSION_CODE >= 2703
diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
index 8321bcd1aa7acf..6705a98d976d48 100644
--- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu
@@ -14,11 +14,11 @@
 #include "paddle/phi/kernels/put_along_axis_grad_kernel.h"
+#include "paddle/common/data_type.h"
"paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/gather_scatter_functor.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu index b63047973e9b82..b2ce3eb747c2fd 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu @@ -14,11 +14,11 @@ #include "paddle/phi/kernels/put_along_axis_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/gather_scatter_functor.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/qr_kernel.cu b/paddle/phi/kernels/gpu/qr_kernel.cu index 6c036b83ee9d1f..dfaf23c74823a0 100644 --- a/paddle/phi/kernels/gpu/qr_kernel.cu +++ b/paddle/phi/kernels/gpu/qr_kernel.cu @@ -19,9 +19,9 @@ #include #include "paddle/common/backends/dynload/cusolver.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/funcs/complex_functors.h" @@ -191,8 +191,8 @@ void BatchedGeqrf(const GPUContext& dev_ctx, int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnSgeqrf_bufferSize(handle, m, n, a, lda, &lwork)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cusolverDnSgeqrf_bufferSize( + handle, m, n, a, lda, &lwork)); DenseTensor workspace = DenseTensor(); workspace.Resize(make_ddim({lwork})); @@ -206,15 +206,16 @@ void BatchedGeqrf(const GPUContext& dev_ctx, float* a_working_ptr = &a[i * a_stride]; float* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSgeqrf(handle, - m, - n, - a_working_ptr, - lda, - tau_working_ptr, - workspace_ptr, - lwork, - info_d)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cusolverDnSgeqrf(handle, + m, + n, + a_working_ptr, + lda, + tau_working_ptr, + workspace_ptr, + lwork, + info_d)); // Do we need synchronized here? // check the error info int info_h; @@ -245,8 +246,8 @@ void BatchedGeqrf(const GPUContext& dev_ctx, int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnDgeqrf_bufferSize(handle, m, n, a, lda, &lwork)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cusolverDnDgeqrf_bufferSize( + handle, m, n, a, lda, &lwork)); DenseTensor workspace = DenseTensor(); workspace.Resize(make_ddim({lwork})); @@ -260,15 +261,16 @@ void BatchedGeqrf(const GPUContext& dev_ctx, double* a_working_ptr = &a[i * a_stride]; double* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDgeqrf(handle, - m, - n, - a_working_ptr, - lda, - tau_working_ptr, - workspace_ptr, - lwork, - info_d)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cusolverDnDgeqrf(handle, + m, + n, + a_working_ptr, + lda, + tau_working_ptr, + workspace_ptr, + lwork, + info_d)); // Do we need synchronized here? 
 // check the error info
 int info_h;
@@ -300,7 +302,7 @@ void BatchedOrgqr(const GPUContext& dev_ctx,
   int lwork = 0;
   auto handle = dev_ctx.cusolver_dn_handle();
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSorgqr_bufferSize(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cusolverDnSorgqr_bufferSize(
       handle, m, n, k, a, lda, tau, &lwork));
   DenseTensor workspace = DenseTensor();
@@ -315,16 +317,17 @@ void BatchedOrgqr(const GPUContext& dev_ctx,
     float* a_working_ptr = &a[i * a_stride];
     float* tau_working_ptr = &tau[i * tau_stride];
     // compute orggr
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSorgqr(handle,
-                                                              m,
-                                                              n,
-                                                              k,
-                                                              a_working_ptr,
-                                                              lda,
-                                                              tau_working_ptr,
-                                                              workspace_ptr,
-                                                              lwork,
-                                                              info_d));
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        common::dynload::cusolverDnSorgqr(handle,
+                                          m,
+                                          n,
+                                          k,
+                                          a_working_ptr,
+                                          lda,
+                                          tau_working_ptr,
+                                          workspace_ptr,
+                                          lwork,
+                                          info_d));
     // Do we need synchronized here?
     // check the error info
     int info_h;
@@ -356,7 +359,7 @@ void BatchedOrgqr(const GPUContext& dev_ctx,
   int lwork = 0;
   auto handle = dev_ctx.cusolver_dn_handle();
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDorgqr_bufferSize(
+  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cusolverDnDorgqr_bufferSize(
       handle, m, n, k, a, lda, tau, &lwork));
   DenseTensor workspace = DenseTensor();
@@ -371,16 +374,17 @@ void BatchedOrgqr(const GPUContext& dev_ctx,
     double* a_working_ptr = &a[i * a_stride];
     double* tau_working_ptr = &tau[i * tau_stride];
     // compute orggr
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDorgqr(handle,
-                                                              m,
-                                                              n,
-                                                              k,
-                                                              a_working_ptr,
-                                                              lda,
-                                                              tau_working_ptr,
-                                                              workspace_ptr,
-                                                              lwork,
-                                                              info_d));
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        common::dynload::cusolverDnDorgqr(handle,
+                                          m,
+                                          n,
+                                          k,
+                                          a_working_ptr,
+                                          lda,
+                                          tau_working_ptr,
+                                          workspace_ptr,
+                                          lwork,
+                                          info_d));
     // Do we need synchronized here?
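// The BatchedOrgqr specializations above mirror cuSOLVER's two-phase
// convention already seen in BatchedGeqrf: query lwork once, allocate one
// shared workspace, then run each batch entry in place. The whole pattern,
// condensed (float path; names as in the surrounding diff):
int lwork = 0;
PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cusolverDnSorgqr_bufferSize(
    handle, m, n, k, a, lda, tau, &lwork));
DenseTensor workspace;
workspace.Resize(make_ddim({lwork}));
float* workspace_ptr = dev_ctx.template Alloc<float>(&workspace);
for (int i = 0; i < batch_size; ++i) {
  // a and tau advance by their per-matrix strides; info_d is reused
  PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cusolverDnSorgqr(
      handle, m, n, k, &a[i * a_stride], lda, &tau[i * tau_stride],
      workspace_ptr, lwork, info_d));
}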
// check the error info int info_h; diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h index bdbcd05e65772a..861a17a8efc094 100644 --- a/paddle/phi/kernels/gpu/rnn_functor.h +++ b/paddle/phi/kernels/gpu/rnn_functor.h @@ -102,10 +102,10 @@ class RNNDescriptors { if (!is_test_ && !is_initialized) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenDropoutGetStatesSize(handle, &state_size)); + common::dynload::miopenDropoutGetStatesSize(handle, &state_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); + common::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); #endif dropout_state->Resize({static_cast(state_size)}); dev_ctx.template Alloc(dropout_state); @@ -120,7 +120,7 @@ class RNNDescriptors { // ------------------- cudnn rnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetRNNDescriptor_V2( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSetRNNDescriptor_V2( rnn_desc_.desc(), hidden_size_, num_layers_, @@ -132,7 +132,7 @@ class RNNDescriptors { miopenRNNdefault, cudnn_type)); #elif CUDNN_VERSION >= 6000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNDescriptor_v6( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, @@ -144,7 +144,7 @@ class RNNDescriptors { CUDNN_RNN_ALGO_STANDARD, cudnn_type)); #else - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetRNNDescriptor( rnn_desc_.desc(), hidden_size_, num_layers_, @@ -157,7 +157,7 @@ class RNNDescriptors { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNPaddingMode( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetRNNPaddingMode( rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); } #endif @@ -165,10 +165,10 @@ class RNNDescriptors { // ------------------- cudnn weights_size --------------------- size_t weights_size_; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #else - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #endif PADDLE_ENFORCE_EQ( @@ -184,21 +184,21 @@ class RNNDescriptors { // ------------------- cudnn workspace, reserve size --------------------- #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenGetRNNWorkspaceSize(handle, - rnn_desc_.desc(), - seq_length_, - x_descs_.data(), - workspace_size)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNTrainingReserveSize( + common::dynload::miopenGetRNNWorkspaceSize(handle, + rnn_desc_.desc(), + seq_length_, + x_descs_.data(), + workspace_size)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetRNNWorkspaceSize(handle, - rnn_desc_.desc(), - seq_length_, - x_descs_.data(), - workspace_size)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnGetRNNTrainingReserveSize( + common::dynload::cudnnGetRNNWorkspaceSize(handle, + rnn_desc_.desc(), + seq_length_, + 
x_descs_.data(), + workspace_size)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); #endif } diff --git a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc index 3e8dfe813cad70..b8d9f9db25cf81 100644 --- a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc @@ -259,7 +259,7 @@ void RnnGradKernel(const Context &dev_ctx, if (!has_seq_length) { if (x_grad) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenRNNBackwardData( handle, rnn.rnn_desc(), seq_length, @@ -289,7 +289,7 @@ void RnnGradKernel(const Context &dev_ctx, reserve_size)); #else // This interface is used when the input/output is unpadded. - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNBackwardData( handle, rnn.rnn_desc(), seq_length, @@ -321,7 +321,7 @@ void RnnGradKernel(const Context &dev_ctx, } if (!weight_grad_list.empty()) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, @@ -341,7 +341,7 @@ void RnnGradKernel(const Context &dev_ctx, TensorToPermutedWeight( place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec); #else - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, @@ -364,7 +364,7 @@ void RnnGradKernel(const Context &dev_ctx, // for train // This interface is used when the input/output is padded. if (x_grad) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardDataEx( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNBackwardDataEx( handle, rnn.rnn_desc(), rnn.y_seq_desc(), @@ -398,7 +398,7 @@ void RnnGradKernel(const Context &dev_ctx, } if (!weight_grad_list.empty()) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeightsEx( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNBackwardWeightsEx( handle, rnn.rnn_desc(), rnn.x_seq_desc(), diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc index 82800607bae9de..53c9231f3b8798 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc @@ -43,53 +43,53 @@ void RNNInferece(bool has_seq_length, // for inference // This interface is used when the input/output is unpadded. 
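// The backward hunks just above, and the forward paths that follow, keep
// cuDNN's unpadded/padded split: plain cudnnRNNBackwardData /
// cudnnRNNBackwardWeights serve packed input, while the ..Ex variants read
// per-sequence lengths from x_seq_desc/y_seq_desc and are only compiled in
// on CUDA with cuDNN >= 7.2.1. Reduced to its skeleton, the dispatch in
// rnn_grad_kernel.cu.cc looks like this (argument lists elided):
if (!has_seq_length) {
  // packed layout: one tensor descriptor per time step
  PADDLE_ENFORCE_GPU_SUCCESS(
      common::dynload::cudnnRNNBackwardData(/* ...unpadded args... */));
} else {
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
  // padded layout: the sequence descriptors carry the lengths
  PADDLE_ENFORCE_GPU_SUCCESS(
      common::dynload::cudnnRNNBackwardDataEx(/* ...padded args... */));
#endif
}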
#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenRNNForwardInference(handle, - rnn->rnn_desc(), - seq_length, - rnn->x_descs(), - x_data, - rnn->init_h_desc(), - init_h_data, - rnn->init_c_desc(), - init_c_data, - rnn->weight_desc(), - w_data, - rnn->y_descs(), - out_data, - rnn->last_h_desc(), - last_h_data, - rnn->last_c_desc(), - last_c_data, - workspace_data->data(), - workspace_size)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenRNNForwardInference( + handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); #else - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnRNNForwardInference(handle, - rnn->rnn_desc(), - seq_length, - rnn->x_descs(), - x_data, - rnn->init_h_desc(), - init_h_data, - rnn->init_c_desc(), - init_c_data, - rnn->weight_desc(), - w_data, - rnn->y_descs(), - out_data, - rnn->last_h_desc(), - last_h_data, - rnn->last_c_desc(), - last_c_data, - workspace_data->data(), - workspace_size)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNForwardInference( + handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); #endif } else { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 // for inference // This interface is used when the input/output is padded. - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardInferenceEx( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNForwardInferenceEx( handle, rnn->rnn_desc(), rnn->x_seq_desc(), @@ -309,7 +309,7 @@ void RnnKernel(const Context &dev_ctx, // for train // This interface is used when the input/output is unpadded. #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, @@ -332,34 +332,34 @@ void RnnKernel(const Context &dev_ctx, reserve_data, reserve_size)); #else - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnRNNForwardTraining(handle, - rnn.rnn_desc(), - seq_length, - rnn.x_descs(), - x_data, - rnn.init_h_desc(), - init_h_data, - rnn.init_c_desc(), - init_c_data, - rnn.weight_desc(), - w_data, - rnn.y_descs(), - out_data, - rnn.last_h_desc(), - last_h_data, - rnn.last_c_desc(), - last_c_data, - workspace_data_.data(), - workspace_size, - reserve_data, - reserve_size)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNForwardTraining( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_descs(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); #endif } else { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 // for train // This interface is used when the input/output is padded. 
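// A note on the bulky +/- blocks above and below: the argument lists are
// byte-for-byte identical. Replacing phi:: with common:: lengthens the
// qualifier by three characters, which pushes many calls past the column
// limit, so clang-format re-wraps them, e.g.:
//
//   before:  PADDLE_ENFORCE_GPU_SUCCESS(
//                phi::dynload::cudnnRNNForwardTraining(handle, ...));
//   after:   PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNForwardTraining(
//                handle, ...));
//
// The reflow, not the rename, accounts for most of the churn in these files.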
- PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardTrainingEx( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnRNNForwardTrainingEx( handle, rnn.rnn_desc(), rnn.x_seq_desc(), diff --git a/paddle/phi/kernels/gpu/shuffle_batch_grad_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_grad_kernel.cu index 33b39666edf071..9472861a64c8e3 100644 --- a/paddle/phi/kernels/gpu/shuffle_batch_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/shuffle_batch_grad_kernel.cu @@ -21,9 +21,9 @@ #include #endif +#include "paddle/common/errors.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu index e145e7e1c8a206..058c3c6e686b0d 100644 --- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu @@ -21,9 +21,9 @@ #include #endif +#include "paddle/common/errors.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/gpu/svd_kernel.cu b/paddle/phi/kernels/gpu/svd_kernel.cu index 28857334e77b61..0d9dcc302bcb74 100644 --- a/paddle/phi/kernels/gpu/svd_kernel.cu +++ b/paddle/phi/kernels/gpu/svd_kernel.cu @@ -61,22 +61,22 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); + common::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnSgesvdj_bufferSize(handle, - jobz, - thin_UV, - m, - n, - A, - lda, - S, - U, - ldu, - V, - ldt, - &lwork, - gesvdj_params)); + common::dynload::cusolverDnSgesvdj_bufferSize(handle, + jobz, + thin_UV, + m, + n, + A, + lda, + S, + U, + ldu, + V, + ldt, + &lwork, + gesvdj_params)); auto workspace = phi::memory_utils::Alloc( dev_ctx.GetPlace(), lwork * sizeof(float), @@ -86,22 +86,23 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSgesvdj(handle, - jobz, - thin_UV, - m, - n, - A + stride_A * i, - lda, - S + k * i, - U + stride_U * i, - ldu, - V + stride_V * i, - ldt, - workspace_ptr, - lwork, - info, - gesvdj_params)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cusolverDnSgesvdj(handle, + jobz, + thin_UV, + m, + n, + A + stride_A * i, + lda, + S + k * i, + U + stride_U * i, + ldu, + V + stride_V * i, + ldt, + workspace_ptr, + lwork, + info, + gesvdj_params)); // check the error info int error_info; memory_utils::Copy(phi::CPUPlace(), @@ -117,7 +118,7 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, "For batch [%d]: CUSolver SVD is not zero. 
[%d]", i, error_info)); } PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); + common::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } template <> @@ -142,22 +143,22 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); + common::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnDgesvdj_bufferSize(handle, - jobz, - thin_UV, - m, - n, - A, - lda, - S, - U, - ldu, - V, - ldt, - &lwork, - gesvdj_params)); + common::dynload::cusolverDnDgesvdj_bufferSize(handle, + jobz, + thin_UV, + m, + n, + A, + lda, + S, + U, + ldu, + V, + ldt, + &lwork, + gesvdj_params)); auto workspace = phi::memory_utils::Alloc( dev_ctx.GetPlace(), lwork * sizeof(double), @@ -167,22 +168,23 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDgesvdj(handle, - jobz, - thin_UV, - m, - n, - A + stride_A * i, - lda, - S + k * i, - U + stride_U * i, - ldu, - V + stride_V * i, - ldt, - workspace_ptr, - lwork, - info, - gesvdj_params)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cusolverDnDgesvdj(handle, + jobz, + thin_UV, + m, + n, + A + stride_A * i, + lda, + S + k * i, + U + stride_U * i, + ldu, + V + stride_V * i, + ldt, + workspace_ptr, + lwork, + info, + gesvdj_params)); // check the error info int error_info; memory_utils::Copy(phi::CPUPlace(), @@ -198,7 +200,7 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); + common::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } template diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu index 6cea7592836730..6191b97fba398f 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu @@ -14,10 +14,10 @@ #include "paddle/phi/kernels/take_along_axis_grad_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/gather_scatter_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index ba4c6ba27e6824..bbdd7b914ceea4 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -14,10 +14,10 @@ #include "paddle/phi/kernels/take_along_axis_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/gather_scatter_functor.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu index 889c421eb0bb96..4db6dc842f2a5e 100644 --- a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu +++ 
b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/triangular_solve_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" diff --git a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu index 448e6ca38b3f50..83cae05b1350fa 100644 --- a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu @@ -17,8 +17,8 @@ #include "paddle/phi/kernels/unique_consecutive_kernel.h" #include "paddle/phi/kernels/gpu/unique_consecutive_functor.h" +#include "paddle/common/errors.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/errors.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu index 2a3c9515ac2ea7..79bdcb1d474967 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu @@ -58,8 +58,9 @@ void AffineGridGradCudnnKernel(const Context& dev_ctx, const T* output_grad_data = output_grad.data(); T* theta_grad_data = dev_ctx.template Alloc(theta_grad); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfGridGeneratorBackward( - handle, cudnn_st_desc, output_grad_data, theta_grad_data)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cudnnSpatialTfGridGeneratorBackward( + handle, cudnn_st_desc, output_grad_data, theta_grad_data)); } } // namespace phi diff --git a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu index 060f8c86710b58..1ee7a6fcb164ef 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu @@ -55,7 +55,7 @@ void AffineGridCudnnKernel(const Context& dev_ctx, cudnnSpatialTransformerDescriptor_t cudnn_st_desc = st_desc.descriptor(4, h_size_data); - PADDLE_ENFORCE_EQ(phi::dynload::cudnnSpatialTfGridGeneratorForward( + PADDLE_ENFORCE_EQ(common::dynload::cudnnSpatialTfGridGeneratorForward( handle, cudnn_st_desc, theta_data, output_data), 0, phi::errors::Fatal("Some errors has occurred " diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h index d0bdcc10beaa83..726dff49812797 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h @@ -19,11 +19,11 @@ limitations under the License. 
*/ #include "glog/logging.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/dynload/cudnn_frontend.h" #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" @@ -286,7 +286,7 @@ class CudnnFrontendConvHelper { .setDataPointers(data_ptrs->size(), data_ptrs->data()) .setUids(uids->size(), uids->data()) .build(); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnBackendExecute( handle_, plan_desc, variant_pack.get_raw_desc())); }, workspace_size); diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h index dfea9013ab0b87..05007c3c195f19 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h @@ -112,13 +112,14 @@ struct SearchAlgorithmBase { cudnnConvolutionFwdAlgo_t algo) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - algo, - &workspace_size)); + common::dynload::cudnnGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + &workspace_size)); return workspace_size; } @@ -140,7 +141,7 @@ struct SearchAlgorithmBase { int best_algo_idx = 0; std::vector perf_results(kNUM_CUDNN_FWD_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionForwardAlgorithm_v7( + common::dynload::cudnnGetConvolutionForwardAlgorithm_v7( args.handle, args.idesc.desc(), args.wdesc.desc(), @@ -167,7 +168,7 @@ struct SearchAlgorithmBase { << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionForwardAlgorithm( + common::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), @@ -180,7 +181,7 @@ struct SearchAlgorithmBase { } #else PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionForwardAlgorithm( + common::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), @@ -208,7 +209,7 @@ struct SearchAlgorithmBase { std::vector perf_results(kNUM_CUDNN_FWD_ALGS); auto cudnn_find_func = [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnFindConvolutionForwardAlgorithmEx( + common::dynload::cudnnFindConvolutionForwardAlgorithmEx( args.handle, args.idesc.desc(), args.x->data(), @@ -246,7 +247,7 @@ struct SearchAlgorithmBase { size_t max_workspace_size = 0; for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) { size_t workspace_size = 0; - auto status = phi::dynload::cudnnGetConvolutionForwardWorkspaceSize( + auto status = common::dynload::cudnnGetConvolutionForwardWorkspaceSize( args.handle, args.idesc.desc(), args.wdesc.desc(), @@ -286,7 +287,7 @@ struct SearchAlgorithmBase { cudnnConvolutionBwdDataAlgo_t algo) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + common::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -316,7 +317,7 @@ struct SearchAlgorithmBase { int 
best_algo_idx = 0; std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( + common::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -351,7 +352,7 @@ struct SearchAlgorithmBase { << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + common::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -364,7 +365,7 @@ struct SearchAlgorithmBase { } #else PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + common::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -392,7 +393,7 @@ struct SearchAlgorithmBase { std::vector perf_results(kNUM_CUDNN_BWD_DATA_ALGS); auto cudnn_find_func = [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx( + common::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx( args.handle, args.wdesc.desc(), args.w->data(), @@ -431,7 +432,7 @@ struct SearchAlgorithmBase { for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) { size_t workspace_size = 0; auto status = - phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + common::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -470,7 +471,7 @@ struct SearchAlgorithmBase { phi::backends::gpu::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + common::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -500,7 +501,7 @@ struct SearchAlgorithmBase { int best_algo_idx = 0; std::vector perf_results(kNUM_CUDNN_BWD_FILTER_ALGS); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( + common::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -523,7 +524,7 @@ struct SearchAlgorithmBase { << result.workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + common::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -536,7 +537,7 @@ struct SearchAlgorithmBase { } #else PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + common::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -568,7 +569,7 @@ struct SearchAlgorithmBase { auto cudnn_find_func = [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx( + common::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx( args.handle, args.idesc.desc(), args.x->data(), @@ -597,7 +598,7 @@ struct SearchAlgorithmBase { int max_algos = GetAlgorithmMaxCount(args.handle); std::vector perf_results(max_algos); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithm( + common::dynload::cudnnFindConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -625,7 +626,7 @@ struct SearchAlgorithmBase { #if CUDNN_VERSION_MIN(7, 0, 1) int max_algos = 0; 
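// The two search strategies interleaved in conv_cudnn_v7.h differ in cost,
// not just in API: cudnnGetConvolution*Algorithm_v7 ranks algorithms
// heuristically without running them, while cudnnFindConvolution*AlgorithmEx
// benchmarks real kernels inside a workspace-bounded callback. Sketch of the
// exhaustive forward path as wired up here (assuming the v7 perf-struct
// type; argument order per cuDNN's documented signature):
int returned_algo_count = 0;
std::vector<cudnnConvolutionFwdAlgoPerf_t> perf_results(kNUM_CUDNN_FWD_ALGS);
auto cudnn_find_func = [&](void* workspace_ptr) {
  PADDLE_ENFORCE_GPU_SUCCESS(
      common::dynload::cudnnFindConvolutionForwardAlgorithmEx(
          args.handle,
          args.idesc.desc(), args.x->data<T>(),
          args.wdesc.desc(), args.w->data<T>(),
          args.cdesc.desc(),
          args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
          kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
          perf_results.data(), workspace_ptr, workspace_size_limit));
};
workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);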
auto status = - phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + common::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( handle, &max_algos); if (status == gpuSuccess) { VLOG(5) << "[BackwardFilter] max_algos: predefined=" @@ -643,7 +644,7 @@ struct SearchAlgorithmBase { for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) { size_t workspace_size = 0; auto status = - phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + common::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -738,7 +739,7 @@ struct SearchAlgorithm : public SearchAlgorithmBase { const phi::backends::gpu::ConvolutionDescriptor& cdesc) { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "Enable Tensor Core for FLOAT16"; #if CUDA_VERSION >= 11000 @@ -746,16 +747,16 @@ struct SearchAlgorithm : public SearchAlgorithmBase { } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { VLOG(5) << "Enable Tensor Core for BFLOAT16"; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8, 1, 0) } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT"; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_FMA_MATH)); #endif // CUDA_VERSION >= 11000 } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_DEFAULT_MATH)); } #endif @@ -788,7 +789,7 @@ struct ConvRunner { for (int i = 0; i < groups; i++) { workspace_handle->RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnConvolutionForward( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnConvolutionForward( cudnn_handle, &alpha, args.idesc.desc(), @@ -832,7 +833,7 @@ struct ConvRunner { workspace_handle->RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnConvolutionBackwardData( + common::dynload::cudnnConvolutionBackwardData( cudnn_handle, &alpha, args.wdesc.desc(), @@ -876,7 +877,7 @@ struct ConvRunner { workspace_handle->RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnConvolutionBackwardFilter( + common::dynload::cudnnConvolutionBackwardFilter( cudnn_handle, &alpha, args.idesc.desc(), diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index a2c4db700c4ba6..deeb384b4510b7 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -230,38 +230,39 @@ void ConvCudnnGradKernelImplV7( workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionBackwardData(handle, - &alpha, - args1.odesc.desc(), - output_grad_data, - args1.wdesc.desc(), - filter_data, - args1.cdesc.desc(), - bwd_result.algo, - &beta, - args1.idesc.desc(), - temp_tensor_data, - cudnn_workspace_ptr, - 
workspace_size)); + common::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + output_grad_data, + args1.wdesc.desc(), + filter_data, + args1.cdesc.desc(), + bwd_result.algo, + &beta, + args1.idesc.desc(), + temp_tensor_data, + cudnn_workspace_ptr, + workspace_size)); }, workspace_size); PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenOpTensor(handle, - miopenTensorOpAdd, - &alpha, - args1.idesc.desc(), - transformed_input_grad_data, - &alpha, - args1.idesc.desc(), - temp_tensor_data, - &beta, - args1.idesc.desc(), - transformed_input_grad_data)); + common::dynload::miopenOpTensor(handle, + miopenTensorOpAdd, + &alpha, + args1.idesc.desc(), + transformed_input_grad_data, + &alpha, + args1.idesc.desc(), + temp_tensor_data, + &beta, + args1.idesc.desc(), + transformed_input_grad_data)); } else { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionBackwardData( + common::dynload::miopenConvolutionBackwardData( handle, &alpha, args1.odesc.desc(), @@ -302,7 +303,7 @@ void ConvCudnnGradKernelImplV7( workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionBackwardWeights( + common::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args2.odesc.desc(), @@ -1211,19 +1212,20 @@ void ConvCudnnGradGradKernel( workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionForward(handle, - &alpha, - args1.idesc.desc(), - ddx, - args1.wdesc.desc(), - w, - args1.cdesc.desc(), - fwd_result1.algo, - &beta, - args1.odesc.desc(), - transformed_ddy_channel, - workspace_ptr, - workspace_size)); + common::dynload::miopenConvolutionForward( + handle, + &alpha, + args1.idesc.desc(), + ddx, + args1.wdesc.desc(), + w, + args1.cdesc.desc(), + fwd_result1.algo, + &beta, + args1.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); }, workspace_size); #else @@ -1248,19 +1250,20 @@ void ConvCudnnGradGradKernel( workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionForward(handle, - &alpha, - args2.idesc.desc(), - x, - args2.wdesc.desc(), - ddw, - args2.cdesc.desc(), - fwd_result2.algo, - &beta, - args2.odesc.desc(), - transformed_ddy_channel, - workspace_ptr, - workspace_size)); + common::dynload::miopenConvolutionForward( + handle, + &alpha, + args2.idesc.desc(), + x, + args2.wdesc.desc(), + ddw, + args2.cdesc.desc(), + fwd_result2.algo, + &beta, + args2.odesc.desc(), + transformed_ddy_channel, + workspace_ptr, + workspace_size)); }, workspace_size); #else @@ -1290,7 +1293,7 @@ void ConvCudnnGradGradKernel( workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionBackwardWeights( + common::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args3.odesc.desc(), @@ -1329,7 +1332,7 @@ void ConvCudnnGradGradKernel( workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionBackwardData( + common::dynload::miopenConvolutionBackwardData( handle, &alpha, args4.odesc.desc(), diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 093f6cf80e449b..036ef19f49f8da 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -93,8 +93,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, // cudnn 7 can 
support groups, no need to do it manually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(), groups)); + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSetConvolutionGroupCount( + args.cdesc.desc(), groups)); groups = 1; #endif #ifdef PADDLE_WITH_HIP @@ -180,19 +180,19 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, workspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionForward(handle, - &alpha, - args.idesc.desc(), - input_data, - args.wdesc.desc(), - filter_data, - args.cdesc.desc(), - fwd_result.algo, - &beta, - args.odesc.desc(), - output_data, - workspace_ptr, - workspace_size)); + common::dynload::miopenConvolutionForward(handle, + &alpha, + args.idesc.desc(), + input_data, + args.wdesc.desc(), + filter_data, + args.cdesc.desc(), + fwd_result.algo, + &beta, + args.odesc.desc(), + output_data, + workspace_ptr, + workspace_size)); }, workspace_size); #else diff --git a/paddle/phi/kernels/gpudnn/conv_miopen_helper.h b/paddle/phi/kernels/gpudnn/conv_miopen_helper.h index be2c09bf8d18a8..324f89b86056a5 100644 --- a/paddle/phi/kernels/gpudnn/conv_miopen_helper.h +++ b/paddle/phi/kernels/gpudnn/conv_miopen_helper.h @@ -44,7 +44,7 @@ struct SearchAlgorithm { miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenFindConvolutionForwardAlgorithm( + common::dynload::miopenFindConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.x->data(), @@ -70,7 +70,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionForwardGetWorkSpaceSize( + common::dynload::miopenConvolutionForwardGetWorkSpaceSize( args.handle, args.wdesc.desc(), args.idesc.desc(), @@ -100,7 +100,7 @@ struct SearchAlgorithm { miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenFindConvolutionBackwardDataAlgorithm( + common::dynload::miopenFindConvolutionBackwardDataAlgorithm( args.handle, args.odesc.desc(), args.o->data(), @@ -126,7 +126,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( + common::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( args.handle, args.odesc.desc(), args.wdesc.desc(), @@ -156,7 +156,7 @@ struct SearchAlgorithm { miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( + common::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( args.handle, args.odesc.desc(), args.o->data(), @@ -182,7 +182,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( + common::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( args.handle, args.odesc.desc(), args.idesc.desc(), diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index f4b0ac0f926729..f1f5ef97bb0d4e 100644 --- 
a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -18,9 +18,9 @@ limitations under the License. */ #include "paddle/common/backends/dynload/cudnn.h" #include "paddle/common/bfloat16.h" +#include "paddle/common/ddim.h" #include "paddle/common/float16.h" #include "paddle/phi/backends/context_pool.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 0468572a741181..eb273f3fd09999 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -18,9 +18,9 @@ limitations under the License. */ #include "paddle/common/backends/dynload/cudnn.h" #include "paddle/common/bfloat16.h" +#include "paddle/common/ddim.h" #include "paddle/common/float16.h" #include "paddle/phi/backends/context_pool.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/padding.h" diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index d73a1eaeb000cc..bdf8df3bbc1b08 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -1041,7 +1041,7 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx, auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; auto algo = log_mode ? MIOPEN_SOFTMAX_LOG : MIOPEN_SOFTMAX_ACCURATE; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSoftmaxForward_V2( handle, phi::backends::gpu::CudnnDataType::kOne(), desc, @@ -1056,7 +1056,7 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx, auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; auto algo = log_mode ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSoftmaxForward( handle, algo, mode, @@ -1113,7 +1113,7 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx, auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; auto algo = log_mode ? MIOPEN_SOFTMAX_LOG : MIOPEN_SOFTMAX_ACCURATE; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxBackward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::miopenSoftmaxBackward_V2( handle, phi::backends::gpu::CudnnDataType::kOne(), desc, @@ -1130,7 +1130,7 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx, auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; auto algo = log_mode ? 
CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cudnnSoftmaxBackward( handle, algo, mode, diff --git a/paddle/phi/kernels/impl/box_coder.h b/paddle/phi/kernels/impl/box_coder.h index 739293ef54e6bf..95900153da165a 100644 --- a/paddle/phi/kernels/impl/box_coder.h +++ b/paddle/phi/kernels/impl/box_coder.h @@ -16,7 +16,7 @@ #include -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h index c61b10d5a21995..3d907f06bc3a9b 100644 --- a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h +++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h @@ -16,8 +16,8 @@ #include +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/broadcast_tensors_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 2d92f8156b607d..bd997a12ee3a02 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/conv_transpose_grad_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h index 9fab3e6735b40d..9e71eaf8506533 100644 --- a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/common/layout.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/conv_transpose_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h index 01c54d780b4b0e..7fa53a3d86c3c4 100644 --- a/paddle/phi/kernels/impl/determinant_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h @@ -23,7 +23,7 @@ #include "glog/logging.h" #include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/determinant_kernel.h" diff --git a/paddle/phi/kernels/impl/fft_grad_kernel_impl.h b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h index de4bb8d4bd1734..83a37abb5d89a1 100644 --- a/paddle/phi/kernels/impl/fft_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h @@ -18,8 +18,8 @@ #include #include +#include "paddle/common/ddim.h" #include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/complex_kernel.h" diff --git a/paddle/phi/kernels/impl/fft_kernel_impl.h b/paddle/phi/kernels/impl/fft_kernel_impl.h index 13c54182d1d316..e542d758c4ce64 100644 --- a/paddle/phi/kernels/impl/fft_kernel_impl.h 
+++ b/paddle/phi/kernels/impl/fft_kernel_impl.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/fft.h" diff --git a/paddle/phi/kernels/impl/fold_kernel_impl.h b/paddle/phi/kernels/impl/fold_kernel_impl.h index 694d754ecfb8e4..b585a7267a14f8 100644 --- a/paddle/phi/kernels/impl/fold_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_kernel_impl.h @@ -16,8 +16,8 @@ #include +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" diff --git a/paddle/phi/kernels/impl/lstsq_kernel_impl.h b/paddle/phi/kernels/impl/lstsq_kernel_impl.h index a12e0650824140..6c04554c4a5f99 100644 --- a/paddle/phi/kernels/impl/lstsq_kernel_impl.h +++ b/paddle/phi/kernels/impl/lstsq_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/enforce.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/enforce.h" #include "paddle/utils/optional.h" #include "paddle/phi/core/dense_tensor.h" @@ -119,7 +119,7 @@ inline void BatchedOrmqr(const GPUContext& dev_ctx, int ldc = std::max(1, m); auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSormqr_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cusolverDnSormqr_bufferSize( handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork)); DenseTensor* info = new DenseTensor(); info->Resize(make_ddim({1})); @@ -136,20 +136,21 @@ inline void BatchedOrmqr(const GPUContext& dev_ctx, float* workspace_ptr = dev_ctx.template Alloc(workspace); // compute ormgr - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSormqr(handle, - side, - trans, - m, - n, - k, - a_working_ptr, - lda, - tau_working_ptr, - other_working_ptr, - ldc, - workspace_ptr, - lwork, - info_d)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cusolverDnSormqr(handle, + side, + trans, + m, + n, + k, + a_working_ptr, + lda, + tau_working_ptr, + other_working_ptr, + ldc, + workspace_ptr, + lwork, + info_d)); // check the error info int info_h; @@ -188,7 +189,7 @@ inline void BatchedOrmqr(const GPUContext& dev_ctx, int ldc = std::max(1, m); auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDormqr_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(common::dynload::cusolverDnDormqr_bufferSize( handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork)); DenseTensor* info = new DenseTensor(); info->Resize(make_ddim({1})); @@ -205,20 +206,21 @@ inline void BatchedOrmqr(const GPUContext& dev_ctx, double* workspace_ptr = dev_ctx.template Alloc(workspace); // compute ormgr - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDormqr(handle, - side, - trans, - m, - n, - k, - a_working_ptr, - lda, - tau_working_ptr, - other_working_ptr, - ldc, - workspace_ptr, - lwork, - info_d)); + PADDLE_ENFORCE_GPU_SUCCESS( + common::dynload::cusolverDnDormqr(handle, + side, + trans, + m, + n, + k, + a_working_ptr, + lda, + tau_working_ptr, + other_working_ptr, + ldc, + workspace_ptr, + lwork, + info_d)); // check the error info int info_h; diff --git a/paddle/phi/kernels/impl/lu_kernel_impl.h b/paddle/phi/kernels/impl/lu_kernel_impl.h index d2838551ff20a7..e6f7e88a1ab218 100644 --- a/paddle/phi/kernels/impl/lu_kernel_impl.h +++ 
b/paddle/phi/kernels/impl/lu_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/elementwise_add_kernel.h" #include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" diff --git a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h index e3e19370c86bf1..cf00a9b82b8dd8 100644 --- a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/pool_grad_kernel.h" diff --git a/paddle/phi/kernels/impl/pool_kernel_impl.h b/paddle/phi/kernels/impl/pool_kernel_impl.h index a2a6705a68302b..dc0b7ad2108ac5 100644 --- a/paddle/phi/kernels/impl/pool_kernel_impl.h +++ b/paddle/phi/kernels/impl/pool_kernel_impl.h @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/pool_kernel.h" diff --git a/paddle/phi/kernels/impl/qr_grad_kernel_impl.h b/paddle/phi/kernels/impl/qr_grad_kernel_impl.h index d22eca3c73393e..b67512f4f895ab 100644 --- a/paddle/phi/kernels/impl/qr_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/qr_grad_kernel_impl.h @@ -13,8 +13,8 @@ // limitations under the License. #pragma once +#include "paddle/common/enforce.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/unary.h" diff --git a/paddle/phi/kernels/impl/qr_kernel_impl.h b/paddle/phi/kernels/impl/qr_kernel_impl.h index cb086590271eb1..79e8a39650b8c7 100644 --- a/paddle/phi/kernels/impl/qr_kernel_impl.h +++ b/paddle/phi/kernels/impl/qr_kernel_impl.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/enforce.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" #include "paddle/utils/optional.h" #if defined(PADDLE_WITH_CUDA) diff --git a/paddle/phi/kernels/impl/searchsorted_kernel_impl.h b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h index b3be4b9d556645..f933b718a28fe8 100644 --- a/paddle/phi/kernels/impl/searchsorted_kernel_impl.h +++ b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/kernels/funcs/algorithm.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h b/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h index a5798d66ee5c7e..02004e7442da6d 100644 --- a/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h +++ b/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h @@ -20,7 +20,7 @@ #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/impl/determinant_kernel_impl.h" #include "paddle/phi/kernels/slogdeterminant_kernel.h" diff --git a/paddle/phi/kernels/impl/warpctc_kernel_impl.h b/paddle/phi/kernels/impl/warpctc_kernel_impl.h 
index 4b4bd6f5143dd3..6693b8fdbd7ba3 100644 --- a/paddle/phi/kernels/impl/warpctc_kernel_impl.h +++ b/paddle/phi/kernels/impl/warpctc_kernel_impl.h @@ -58,16 +58,16 @@ class ComputeCtcLossFunctor { float* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss(activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return common::dynload::compute_ctc_loss(activations, + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -84,7 +84,7 @@ class ComputeCtcLossFunctor { double* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss_double( + return common::dynload::compute_ctc_loss_double( activations, gradients, flat_labels, @@ -141,14 +141,14 @@ class WarpCTCFunctor { ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { status = - phi::dynload::get_workspace_size(cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + common::dynload::get_workspace_size(cpu_label_lengths, + cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } else { - status = phi::dynload::get_workspace_size_double( + status = common::dynload::get_workspace_size_double( cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), @@ -162,7 +162,7 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + common::dynload::ctcGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -197,12 +197,12 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + common::dynload::ctcGetStatusString(status))); } protected: void init(const Context& dev_ctx, const size_t blank) { - warpctc_version_ = phi::dynload::get_warpctc_version(); + warpctc_version_ = common::dynload::get_warpctc_version(); if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h index f51041285aaee9..42834f3b224925 100644 --- a/paddle/phi/kernels/impl/warprnnt_kernel_impl.h +++ b/paddle/phi/kernels/impl/warprnnt_kernel_impl.h @@ -55,16 +55,16 @@ class ComputeRnntLossFunctor { float* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return common::dynload::compute_rnnt_loss(activations, + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -81,16 +81,17 @@ class ComputeRnntLossFunctor { double* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss_fp64(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return common::dynload::compute_rnnt_loss_fp64( + activations, + gradients, + 
+        label,
+        label_lengths,
+        input_lengths,
+        static_cast<int>(alphabet_size),
+        static_cast<int>(minibatch),
+        costs,
+        workspace,
+        options);
   }
 };
@@ -148,7 +149,7 @@ class WarpRNNTFunctor {
     }
     size_t workspace_bytes = 0;
-    status = phi::dynload::get_rnnt_workspace_size(
+    status = common::dynload::get_rnnt_workspace_size(
         maxT, maxU, B, gpu, &workspace_bytes, sizeof(T));
     PADDLE_ENFORCE_EQ(
@@ -157,7 +158,7 @@
         errors::PreconditionNotMet(
             "warp-rnnt [version %d] Error in get_rnnt_workspace_size: %s",
             warprnnt_version_,
-            phi::dynload::rnntGetStatusString(status)));
+            common::dynload::rnntGetStatusString(status)));
     PADDLE_ENFORCE_GT(
         workspace_bytes,
         0UL,
@@ -189,7 +190,7 @@
         errors::PreconditionNotMet(
             "warp-rnnt [version %d] Error in get_workspace_size: %s",
             warprnnt_version_,
-            phi::dynload::rnntGetStatusString(status)));
   }

 protected:
@@ -199,7 +200,7 @@
               const size_t blank,
               const float fastemit_lambda,
               const int num_threads) {
-    warprnnt_version_ = phi::dynload::get_warprnnt_version();
+    warprnnt_version_ = common::dynload::get_warprnnt_version();

     options_.maxT = maxT;
     options_.maxU = maxU;
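The warp-ctc and warp-rnnt functors above change only the namespace hosting the lazily loaded symbols, from phi::dynload to common::dynload; every argument list stays the same. For context, the sketch below shows the kind of dlopen-based forwarding stub such a dynload namespace wraps. It is an illustration only, assuming a POSIX loader and the real get_warpctc_version entry point; it is not Paddle's actual dynamic-load macro machinery.

// Simplified dynload stub (illustrative; assumes POSIX dlfcn).
#include <dlfcn.h>
#include <stdexcept>

namespace common {
namespace dynload {

// Opens libwarpctc.so once, then resolves and caches the named symbol.
template <typename FuncT>
FuncT LoadSymbol(const char* name) {
  static void* handle = [] {
    void* h = dlopen("libwarpctc.so", RTLD_LAZY | RTLD_LOCAL);
    if (h == nullptr) throw std::runtime_error(dlerror());
    return h;
  }();
  void* sym = dlsym(handle, name);
  if (sym == nullptr) throw std::runtime_error(dlerror());
  return reinterpret_cast<FuncT>(sym);
}

// Forwarding stub: callers see an ordinary function, exactly like the
// common::dynload::get_warpctc_version() call in the hunks above.
inline int get_warpctc_version() {
  using Fn = int (*)();
  static Fn fn = LoadSymbol<Fn>("get_warpctc_version");
  return fn();
}

}  // namespace dynload
}  // namespace common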
+#include "paddle/common/data_type.h" #include "paddle/common/scalar.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { template diff --git a/paddle/phi/kernels/onednn/dequantize_kernel.cc b/paddle/phi/kernels/onednn/dequantize_kernel.cc index 384ca7ea1e6383..19ab10a23faa66 100644 --- a/paddle/phi/kernels/onednn/dequantize_kernel.cc +++ b/paddle/phi/kernels/onednn/dequantize_kernel.cc @@ -14,10 +14,10 @@ #include "paddle/phi/kernels/dequantize_kernel.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_helper.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 2a3579d99cfe67..a78045aa0dc7ca 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -20,7 +20,7 @@ #ifdef PADDLE_WITH_HIP #include #endif -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" namespace phi { namespace kps { diff --git a/paddle/phi/kernels/primitive/functor_primitives.h b/paddle/phi/kernels/primitive/functor_primitives.h index c742706a0b0222..318f5715b6ca7e 100644 --- a/paddle/phi/kernels/primitive/functor_primitives.h +++ b/paddle/phi/kernels/primitive/functor_primitives.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/enforce.h" #include "paddle/common/float16.h" #include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" namespace phi { diff --git a/paddle/phi/kernels/reverse_kernel.cc b/paddle/phi/kernels/reverse_kernel.cc index 771acacedf0243..cdf380780c2509 100644 --- a/paddle/phi/kernels/reverse_kernel.cc +++ b/paddle/phi/kernels/reverse_kernel.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/reverse_kernel.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc index 6400aa1c2c891a..70f45cd1926ac4 100644 --- a/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc +++ b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc @@ -16,9 +16,9 @@ limitations under the License. */ #include "paddle/common/bfloat16.h" #include "paddle/common/complex.h" +#include "paddle/common/enforce.h" #include "paddle/common/float16.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/elementwise_multiply_kernel.h" diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc index 88a01e1135b7bd..85cb01fffbc10f 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc index 72e3d00962b5dc..077ded01e364fa 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/sparse/elementwise_kernel.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/phi/kernels/sparse/cpu/mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc index d4e240d5e82039..e427ff002875b7 100644 --- a/paddle/phi/kernels/sparse/cpu/mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/mask_kernel.h" +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" diff --git a/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc b/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc index e8badf3d6e8248..2ed6f8be3be80a 100644 --- a/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/sparse/unary_kernel.h" -#include "paddle/phi/core/ddim.h" +#include "paddle/common/ddim.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" diff --git a/paddle/phi/kernels/sparse/cpu/slice_kernel.cc b/paddle/phi/kernels/sparse/cpu/slice_kernel.cc index c40be8a9b15799..81af8339f88a91 100644 --- a/paddle/phi/kernels/sparse/cpu/slice_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/slice_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/sparse/unary_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/slice_utils.h" diff --git a/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu b/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu index 1a43009c519b6c..58d4d94876ea33 100644 --- a/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/addmm_kernel.cu @@ -16,9 +16,9 @@ limitations under the License. 
*/ #include +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" diff --git a/paddle/phi/kernels/sparse/gpu/conv.cu.h b/paddle/phi/kernels/sparse/gpu/conv.cu.h index 689629c9393388..3e4c6535c699b7 100644 --- a/paddle/phi/kernels/sparse/gpu/conv.cu.h +++ b/paddle/phi/kernels/sparse/gpu/conv.cu.h @@ -25,10 +25,10 @@ namespace cub = hipcub; #endif #include "paddle/phi/kernels/sparse/conv_kernel.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" diff --git a/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu b/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu index 47daa1eae19eda..711095df31a4b6 100644 --- a/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/elementwise_kernel.h" #include "paddle/phi/kernels/sparse/empty_kernel.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/common/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" diff --git a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu index 3b93ff9638c052..ff90632911b909 100644 --- a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu @@ -14,10 +14,10 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/mask_kernel.h" +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index f39209e9b8604d..bb8b35a397a60d 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -16,9 +16,9 @@ limitations under the License. */ #include +#include "paddle/common/ddim.h" +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" diff --git a/paddle/phi/kernels/sparse/gpu/mv_kernel.cu b/paddle/phi/kernels/sparse/gpu/mv_kernel.cu index 27f094fb0fa982..a921ab6d9de59b 100644 --- a/paddle/phi/kernels/sparse/gpu/mv_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mv_kernel.cu @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" diff --git a/paddle/phi/kernels/sparse/gpu/slice_kernel.cu b/paddle/phi/kernels/sparse/gpu/slice_kernel.cu index f47accfc8eff81..b96883c0ea3e17 100644 --- a/paddle/phi/kernels/sparse/gpu/slice_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/slice_kernel.cu @@ -17,11 +17,11 @@ #include "paddle/phi/kernels/sparse/unary_kernel.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 084cb0e60bb6de..ae2a20cea29cc2 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -20,9 +20,9 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/rocsparse.h" #endif +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/visit_type.h" @@ -292,12 +292,12 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx, #ifdef PADDLE_WITH_HIP dev_ctx.CusparseCall([&](rocsparse_handle handle) { - phi::dynload::rocsparse_csr2coo(handle, - csr_crows_data, - non_zero_num, - rows, - coo_rows_data, - rocsparse_index_base_zero); + common::dynload::rocsparse_csr2coo(handle, + csr_crows_data, + non_zero_num, + rows, + coo_rows_data, + rocsparse_index_base_zero); }); #else auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); diff --git a/paddle/phi/kernels/sparse/unary_kernel.h b/paddle/phi/kernels/sparse/unary_kernel.h index 24bf4f131f6101..dff8742f5afc79 100644 --- a/paddle/phi/kernels/sparse/unary_kernel.h +++ b/paddle/phi/kernels/sparse/unary_kernel.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/common/int_array.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" diff --git a/paddle/phi/kernels/triangular_solve_grad_kernel.h b/paddle/phi/kernels/triangular_solve_grad_kernel.h index eb5a5ab461a1dc..1b51ad50d3246a 100644 --- a/paddle/phi/kernels/triangular_solve_grad_kernel.h +++ b/paddle/phi/kernels/triangular_solve_grad_kernel.h @@ -14,9 +14,9 @@ #pragma once +#include "paddle/common/ddim.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/xpu/adam_kernel.cc b/paddle/phi/kernels/xpu/adam_kernel.cc index a4c0d017d82de0..ad1ea4b96a043c 100644 --- a/paddle/phi/kernels/xpu/adam_kernel.cc +++ b/paddle/phi/kernels/xpu/adam_kernel.cc @@ -16,9 +16,9 @@ #include "glog/logging.h" +#include "paddle/common/enforce.h" #include "paddle/common/float16.h" #include "paddle/phi/backends/xpu/xpu_context.h" 
-#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/adam_functors.h" diff --git a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc index b5b2ed7d328884..f2b4dbbc08d39f 100644 --- a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc @@ -14,10 +14,10 @@ #include "paddle/phi/kernels/arg_min_max_kernel.h" +#include "paddle/common/data_type.h" +#include "paddle/common/ddim.h" #include "paddle/phi/backends/xpu/xpu_context.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { diff --git a/paddle/phi/kernels/xpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/xpu/index_sample_grad_kernel.cc index 22c35ef46840fc..fa2d481f1afaeb 100644 --- a/paddle/phi/kernels/xpu/index_sample_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/index_sample_grad_kernel.cc @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/index_sample_grad_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { diff --git a/paddle/phi/kernels/xpu/index_select_grad_kernel.cc b/paddle/phi/kernels/xpu/index_select_grad_kernel.cc index 14bfce38799f0c..a7bc62a54430b9 100644 --- a/paddle/phi/kernels/xpu/index_select_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/index_select_grad_kernel.cc @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/index_select_grad_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { diff --git a/paddle/phi/kernels/xpu/index_select_kernel.cc b/paddle/phi/kernels/xpu/index_select_kernel.cc index 75c19aa028bce7..b4c63203f1068f 100644 --- a/paddle/phi/kernels/xpu/index_select_kernel.cc +++ b/paddle/phi/kernels/xpu/index_select_kernel.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/phi/kernels/index_select_kernel.h" +#include "paddle/common/data_type.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/utils/data_type.h" namespace phi { diff --git a/paddle/phi/kernels/xpu/kldiv_loss_grad_kernel.cc b/paddle/phi/kernels/xpu/kldiv_loss_grad_kernel.cc index 5d2c750a4dfa33..64278d50e8ce5a 100644 --- a/paddle/phi/kernels/xpu/kldiv_loss_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/kldiv_loss_grad_kernel.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/common/enforce.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/softmax_kernel.h" diff --git a/paddle/phi/kernels/xpu/kldiv_loss_kernel.cc b/paddle/phi/kernels/xpu/kldiv_loss_kernel.cc index 4ef917f008ab9e..bf1c58855184d2 100644 --- a/paddle/phi/kernels/xpu/kldiv_loss_kernel.cc +++ b/paddle/phi/kernels/xpu/kldiv_loss_kernel.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "paddle/common/enforce.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
-#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/softmax_kernel.h"
diff --git a/paddle/phi/kernels/xpu/one_hot_kernel.cc b/paddle/phi/kernels/xpu/one_hot_kernel.cc
index ad96d4858f7ed6..162fcf805ab4d4 100644
--- a/paddle/phi/kernels/xpu/one_hot_kernel.cc
+++ b/paddle/phi/kernels/xpu/one_hot_kernel.cc
@@ -13,10 +13,10 @@
 // limitations under the License.
 #include "paddle/phi/kernels/one_hot_kernel.h"
+#include "paddle/common/data_type.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/backends/xpu/xpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/core/utils/data_type.h"

 namespace phi {
 template
diff --git a/paddle/phi/kernels/xpu/unique_kernel.cc b/paddle/phi/kernels/xpu/unique_kernel.cc
index 6f2d8f470a2120..7cb4ceb97c652c 100644
--- a/paddle/phi/kernels/xpu/unique_kernel.cc
+++ b/paddle/phi/kernels/xpu/unique_kernel.cc
@@ -19,10 +19,10 @@
 #include "paddle/phi/kernels/unique_kernel.h"
+#include "paddle/common/data_type.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/core/visit_type.h"

 namespace phi {
diff --git a/test/cpp/fluid/math/selected_rows_functor_test.cu.cc b/test/cpp/fluid/math/selected_rows_functor_test.cu.cc
index b507f096082f94..20cffb62d43b4c 100644
--- a/test/cpp/fluid/math/selected_rows_functor_test.cu.cc
+++ b/test/cpp/fluid/math/selected_rows_functor_test.cu.cc
@@ -15,10 +15,10 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/selected_rows_functor.h"

 #include "gtest/gtest.h"
+#include "paddle/common/errors.h"
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/test/cpp/phi/core/test_tensor_array.cc b/test/cpp/phi/core/test_tensor_array.cc
index 201790a7bc0e10..ae2685d6fc98e7 100644
--- a/test/cpp/phi/core/test_tensor_array.cc
+++ b/test/cpp/phi/core/test_tensor_array.cc
@@ -17,9 +17,9 @@ limitations under the License. */
 #include

 #include "gtest/gtest.h"
+#include "paddle/common/errors.h"
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/core/errors.h"
 #include "paddle/phi/core/tensor_array.h"
 #include "test/cpp/phi/core/allocator.h"
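Note what stays fixed across all of these hunks: the PADDLE_ENFORCE_* call sites and the errors:: factories they wrap are untouched; only the header that declares them moves. A sketch of that recurring pattern follows, with a hypothetical helper, assuming paddle/common/enforce.h continues to export the same macros and error factories that paddle/phi/core/enforce.h did.

#include <cstddef>

#include "paddle/common/enforce.h"  // was "paddle/phi/core/enforce.h"

namespace phi {
// Hypothetical helper (not from the diff) mirroring the workspace-size checks
// in the warp-ctc/warp-rnnt functors above.
inline void CheckWorkspaceBytes(size_t workspace_bytes, int version) {
  PADDLE_ENFORCE_GT(
      workspace_bytes,
      0UL,
      errors::PreconditionNotMet(
          "warp-ctc [version %d] workspace query returned an empty buffer.",
          version));
}
}  // namespace phi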