From c3aa04c0e90a6c89c7667ad435e170896e7074ac Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Thu, 22 Sep 2022 09:48:57 +0000
Subject: [PATCH 01/15] remove needless using tensor

---
 .../distributed/ps/service/brpc_utils.cc | 2 +- paddle/fluid/eager/eager_tensor.h | 6 +- paddle/fluid/framework/attribute_checker.h | 2 +- .../fluid/framework/copy_same_tensor_test.cc | 6 +- paddle/fluid/framework/custom_operator.cc | 16 +- .../fluid/framework/data_device_transform.cc | 4 +- .../fluid/framework/data_device_transform.h | 4 +- .../framework/data_device_transform_test.cu | 6 +- paddle/fluid/framework/data_feed.proto | 2 +- .../fluid/framework/data_layout_transform.cc | 15 +- .../fluid/framework/data_layout_transform.h | 22 +- .../framework/data_layout_transform_test.cc | 8 +- paddle/fluid/framework/data_transform.cc | 14 +- paddle/fluid/framework/data_transform.h | 6 +- paddle/fluid/framework/data_type_test.cc | 6 +- paddle/fluid/framework/data_type_transform.cc | 28 +- paddle/fluid/framework/data_type_transform.h | 12 +- .../framework/data_type_transform_test.cc | 16 +- .../framework/data_type_transform_test.cu | 16 +- .../framework/details/all_reduce_op_handle.cc | 2 +- .../framework/details/broadcast_op_handle.cc | 4 +- .../details/broadcast_op_handle_test.h | 4 +- .../framework/details/build_strategy_test.cc | 3 +- .../details/fetch_async_op_handle.cc | 4 +- .../framework/details/gather_op_handle.cc | 4 +- .../details/gather_op_handle_test.cc | 2 +- .../framework/details/nan_inf_utils_detail.cc | 35 ++- .../framework/details/nan_inf_utils_detail.cu | 2 +- .../framework/details/nan_inf_utils_detail.h | 6 +- .../framework/details/reduce_and_gather.h | 2 +- .../details/reduce_op_handle_test.cc | 4 +- .../details/scale_loss_grad_op_handle.cc | 4 +- .../details/scope_buffered_monitor.cc | 7 +- .../details/share_tensor_buffer_functor.h | 4 +- .../framework/details/variable_visitor.cc | 4 +- .../framework/details/variable_visitor.h | 2 +- paddle/fluid/framework/device_worker.cc | 32 +-- paddle/fluid/framework/device_worker.h | 7 +- paddle/fluid/framework/dlpack_tensor.cc | 2 +- paddle/fluid/framework/dlpack_tensor.h | 2 +- paddle/fluid/framework/dlpack_tensor_test.cc | 4 +- .../fluid/framework/downpour_lite_worker.cc | 10 +- paddle/fluid/framework/downpour_worker.cc | 10 +- paddle/fluid/framework/eigen.h | 27 +- paddle/fluid/framework/eigen_test.cc | 14 +- paddle/fluid/framework/fleet/ascend_wrapper.h | 11 +- paddle/fluid/framework/framework.proto | 2 +- paddle/fluid/framework/infershape_utils.cc | 2 +- paddle/fluid/framework/infershape_utils.h | 18 +- .../framework/ir/attention_lstm_fuse_pass.cc | 24 +- .../fluid/framework/ir/conv_bn_fuse_pass.cc | 18 +- paddle/fluid/framework/ir/fc_fuse_pass.cc | 2 +- .../framework/ir/fusion_group/operation.cc | 3 +- .../reference_count_pass.cc | 2 +- .../compute_propagate_scales_mkldnn_pass.cc | 22 +- .../compute_propagate_scales_mkldnn_pass.h | 11 +- ...ute_propagate_scales_mkldnn_pass_tester.cc | 8 +- .../conv_affine_channel_mkldnn_fuse_pass.cc | 16 +- .../mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc | 6 +- .../ir/mkldnn/quant_dequant_mkldnn_pass.cc | 6 +- .../ir/mkldnn/quant_dequant_mkldnn_pass.h | 4 +- .../ir/multihead_matmul_fuse_pass.cc | 12 +- paddle/fluid/framework/ir/pass_test_util.h | 2 +- paddle/fluid/framework/lod_tensor.h | 4 +- .../framework/new_executor/data_transfer.cc | 12 +- .../framework/new_executor/interpretercore.cc | 16 +- .../new_executor/new_executor_defs.cc | 2 +- paddle/fluid/framework/operator.cc | 129 +++++-----
paddle/fluid/framework/operator.h | 34 +-- paddle/fluid/framework/operator_test.cc | 19 +- .../paddle2cinn/cinn_graph_symbolization.cc | 4 +- .../cinn_graph_symbolization_test.cc | 2 +- .../framework/paddle2cinn/cinn_lib_test.cc | 14 +- paddle/fluid/framework/program_desc.h | 12 +- paddle/fluid/framework/save_load_util.cc | 64 ++--- paddle/fluid/framework/save_load_util.h | 7 +- paddle/fluid/framework/save_load_util_test.cc | 8 +- .../framework/selected_rows_utils_test.cc | 6 +- paddle/fluid/framework/tensor.h | 1 - paddle/fluid/framework/tensor_test.cc | 94 +++---- paddle/fluid/framework/tensor_util.cc | 104 ++++---- paddle/fluid/framework/tensor_util.h | 67 ++--- paddle/fluid/framework/tensor_util_test.cc | 108 ++++---- paddle/fluid/framework/tensor_util_test.cu | 48 ++-- paddle/fluid/framework/tuple.h | 2 +- paddle/fluid/framework/var_type_traits.h | 2 +- paddle/fluid/framework/variable.h | 18 +- paddle/fluid/framework/variable_test.cc | 2 +- paddle/fluid/framework/version.cc | 4 +- paddle/fluid/framework/version.h | 2 +- paddle/fluid/imperative/all_reduce.cc | 4 +- paddle/fluid/imperative/basic_engine.cc | 2 +- paddle/fluid/imperative/bkcl_context.cc | 6 +- paddle/fluid/imperative/cncl_context.cc | 6 +- paddle/fluid/imperative/gloo_context.cc | 4 +- paddle/fluid/imperative/gloo_context.h | 2 +- .../fluid/imperative/gradient_accumulator.cc | 4 +- paddle/fluid/imperative/hccl_context.cc | 6 +- paddle/fluid/imperative/nccl_context.cc | 2 +- paddle/fluid/imperative/prepared_operator.cc | 4 +- paddle/fluid/imperative/prepared_operator.h | 6 +- paddle/fluid/imperative/reducer.cc | 37 ++- paddle/fluid/imperative/reducer.cu | 2 +- paddle/fluid/imperative/reducer.h | 8 +- .../tests/test_gradient_accmulator.cc | 4 +- paddle/fluid/imperative/tests/test_group.cc | 6 +- .../fluid/imperative/tests/test_prepare_op.cc | 2 +- paddle/fluid/imperative/variable_wrapper.h | 6 +- .../passes/convert_to_mixed_precision.cc | 4 +- .../ir_params_sync_among_devices_pass.cc | 12 +- .../api/details/reset_tensor_array.h | 2 +- .../inference/api/details/zero_copy_tensor.cc | 4 +- .../tensorrt/convert/fill_constant_op.cc | 2 +- .../inference/tensorrt/convert/gelu_op.cc | 4 +- .../inference/tensorrt/convert/matmul_op.cc | 3 +- .../inference/tensorrt/convert/op_converter.h | 4 +- .../inference/tensorrt/convert/scale_op.cc | 2 +- .../tensorrt/convert/strided_slice_op.cc | 2 +- paddle/fluid/inference/tensorrt/engine.cc | 24 +- paddle/fluid/inference/tensorrt/engine.h | 10 +- .../plugin/emb_eltwise_layernorm_plugin.h | 11 +- .../plugin/fused_token_prune_op_plugin.cu | 2 +- .../tensorrt/plugin/group_norm_op_plugin.cu | 4 +- .../tensorrt/plugin/group_norm_op_plugin.h | 25 +- .../tensorrt/plugin/instance_norm_op_plugin.h | 4 +- .../tensorrt/plugin/layer_norm_op_plugin.h | 25 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 8 +- .../tensorrt/plugin/qkv_to_context_plugin.h | 11 +- .../inference/tensorrt/test_dynamic_engine.cc | 8 +- .../fluid/inference/tensorrt/test_engine.cc | 4 +- .../inference/tensorrt/trt_int8_calibrator.cc | 2 +- .../inference/tensorrt/trt_int8_calibrator.h | 2 +- paddle/fluid/operators/abs_op.cc | 2 +- paddle/fluid/operators/abs_op_mlu.cc | 12 +- paddle/fluid/operators/abs_op_npu.cc | 12 +- paddle/fluid/operators/activation_cudnn.cu.cc | 4 +- .../fluid/operators/activation_cudnn_op.cu.cc | 20 +- paddle/fluid/operators/activation_op.cc | 8 +- paddle/fluid/operators/activation_op.h | 73 +++--- paddle/fluid/operators/activation_op.kps | 16 +- paddle/fluid/operators/activation_op_mlu.cc | 82 +++--- 
paddle/fluid/operators/activation_op_npu.cc | 2 +- paddle/fluid/operators/addmm_op.cc | 1 - paddle/fluid/operators/affine_channel_op.cc | 20 +- paddle/fluid/operators/affine_channel_op.cu | 22 +- .../fluid/operators/affine_channel_op_xpu.cc | 20 +- paddle/fluid/operators/affine_grid_op.cc | 2 +- .../amp/alloc_float_status_op_npu.cc | 4 +- .../amp/check_finite_and_unscale_op_mlu.cc | 10 +- .../amp/check_finite_and_unscale_op_npu.cc | 14 +- .../check_finite_and_unscale_op_npu_test.cc | 2 +- .../amp/clear_float_status_op_npu.cc | 6 +- .../operators/amp/get_float_status_op_npu.cc | 6 +- .../operators/amp/update_loss_scaling_op.cc | 2 +- .../amp/update_loss_scaling_op_npu.cc | 51 ++-- paddle/fluid/operators/arg_max_op_mlu.cc | 10 +- paddle/fluid/operators/arg_max_op_npu.cc | 6 +- paddle/fluid/operators/arg_min_op_npu.cc | 6 +- paddle/fluid/operators/argsort_op_mlu.cc | 8 +- paddle/fluid/operators/argsort_op_npu.cc | 32 +-- paddle/fluid/operators/array_operator.h | 2 +- .../fluid/operators/array_to_lod_tensor_op.cc | 4 +- paddle/fluid/operators/ascend_trigger_op.h | 4 +- paddle/fluid/operators/assign_op.cc | 2 +- paddle/fluid/operators/assign_op_test.cc | 4 +- paddle/fluid/operators/assign_pos_op.cu | 2 +- paddle/fluid/operators/assign_value_op.h | 16 +- paddle/fluid/operators/attention_lstm_op.cc | 25 +- paddle/fluid/operators/attention_lstm_op.h | 2 +- paddle/fluid/operators/batch_fc_op.cu | 17 +- paddle/fluid/operators/batch_norm_op.cc | 19 +- paddle/fluid/operators/batch_norm_op.cu | 2 +- paddle/fluid/operators/batch_norm_op.h | 6 +- paddle/fluid/operators/batch_norm_op_mlu.cc | 46 ++-- paddle/fluid/operators/batch_norm_op_npu.cc | 50 ++-- paddle/fluid/operators/bce_loss_op.cc | 2 - paddle/fluid/operators/bce_loss_op_mlu.cc | 16 +- paddle/fluid/operators/bce_loss_op_npu.cc | 16 +- paddle/fluid/operators/beam_search_op.h | 2 +- paddle/fluid/operators/bilateral_slice_op.cc | 3 +- paddle/fluid/operators/bilateral_slice_op.cu | 27 +- paddle/fluid/operators/bincount_op.cc | 1 - paddle/fluid/operators/bmm_op.h | 10 +- paddle/fluid/operators/bpr_loss_op.h | 27 +- .../fluid/operators/broadcast_tensors_op.cc | 1 - paddle/fluid/operators/cast_op.h | 12 +- paddle/fluid/operators/cast_op_mlu.cc | 6 +- paddle/fluid/operators/cast_op_npu.cc | 6 +- paddle/fluid/operators/center_loss_op.cu | 18 +- paddle/fluid/operators/center_loss_op.h | 22 +- paddle/fluid/operators/chunk_eval_op.h | 16 +- .../operators/cinn/cinn_launch_context.h | 2 +- paddle/fluid/operators/clip_by_norm_op.h | 2 +- paddle/fluid/operators/clip_by_norm_op_npu.cc | 6 +- paddle/fluid/operators/clip_by_norm_op_xpu.cc | 8 +- paddle/fluid/operators/clip_op_mlu.cc | 20 +- paddle/fluid/operators/clip_op_npu.cc | 24 +- paddle/fluid/operators/coalesce_tensor_op.cc | 2 +- .../fluid/operators/collective/allreduce_op.h | 4 +- .../operators/collective/barrier_op.cu.cc | 4 +- .../operators/collective/broadcast_op.cu.cc | 4 +- .../operators/collective/broadcast_op_xpu.cc | 4 +- .../operators/collective/c_allgather_op.cu.cc | 4 +- .../operators/collective/c_allgather_op.h | 4 +- .../collective/c_allgather_op_mlu.cc | 4 +- .../collective/c_allgather_op_npu.cc | 4 +- .../collective/c_allgather_op_xpu.cc | 4 +- .../operators/collective/c_allreduce_op.h | 28 +- .../operators/collective/c_broadcast_op.cu.cc | 4 +- .../operators/collective/c_broadcast_op.h | 4 +- .../collective/c_broadcast_op_mlu.cc | 4 +- .../collective/c_broadcast_op_npu.cc | 4 +- .../operators/collective/c_concat_op.cu.cc | 10 +- .../collective/c_embedding_op_npu.cc | 16 +- 
.../fluid/operators/collective/c_reduce_op.h | 12 +- .../collective/c_reducescatter_op.cu.cc | 4 +- .../collective/c_reducescatter_op_npu.cc | 4 +- .../operators/collective/c_scatter_op.cu.cc | 10 +- .../fluid/operators/collective/c_scatter_op.h | 4 +- .../c_softmax_with_cross_entropy_op.cu | 31 +-- .../fluid/operators/collective/c_split_op.cu | 4 +- .../collective/global_gather_op.cu.cc | 8 +- .../collective/global_scatter_op.cu.cc | 8 +- .../collective/partial_allgather_op.cu.cc | 4 +- .../collective/partial_allgather_op_npu.cc | 4 +- .../operators/collective/recv_v2_op.cu.cc | 12 +- .../operators/collective/send_v2_op.cu.cc | 15 +- paddle/fluid/operators/concat_op.cc | 8 +- paddle/fluid/operators/concat_op_mlu.cc | 7 +- paddle/fluid/operators/concat_op_npu.cc | 5 +- .../fluid/operators/controlflow/fetch_op.cc | 2 +- .../operators/controlflow/fetch_v2_op.cc | 4 +- .../operators/controlflow/logical_op_mlu.cc | 8 +- .../operators/controlflow/logical_op_npu.cc | 18 +- .../operators/controlflow/logical_op_xpu.h | 14 +- paddle/fluid/operators/conv_base_helper.h | 10 +- paddle/fluid/operators/conv_cudnn_helper.h | 4 +- paddle/fluid/operators/conv_miopen_helper.h | 4 +- paddle/fluid/operators/conv_op.cc | 8 +- paddle/fluid/operators/conv_op.h | 6 +- paddle/fluid/operators/conv_op_mlu.cc | 40 +-- paddle/fluid/operators/conv_op_npu.cc | 68 ++--- paddle/fluid/operators/conv_shift_op.cc | 17 +- paddle/fluid/operators/conv_shift_op.cu | 21 +- paddle/fluid/operators/conv_transpose_op.cc | 2 +- paddle/fluid/operators/conv_transpose_op.h | 2 +- .../fluid/operators/conv_transpose_op_mlu.cc | 22 +- .../fluid/operators/conv_transpose_op_npu.cc | 28 +- paddle/fluid/operators/copy_cross_scope_op.cc | 4 +- paddle/fluid/operators/correlation_op.cc | 14 +- paddle/fluid/operators/correlation_op.cu | 28 +- paddle/fluid/operators/cos_sim_op.cc | 2 - paddle/fluid/operators/cos_sim_op.h | 27 +- paddle/fluid/operators/crf_decoding_op.h | 25 +- paddle/fluid/operators/crop_op.cc | 2 - paddle/fluid/operators/crop_op.h | 24 +- paddle/fluid/operators/crop_op_npu.cc | 10 +- paddle/fluid/operators/crop_tensor_op.cc | 6 +- paddle/fluid/operators/cross_entropy_op.h | 32 +-- paddle/fluid/operators/ctc_align_op.h | 2 +- .../fluid/operators/cuda_graph_with_in_out.h | 28 +- paddle/fluid/operators/cudnn_lstm_cache.h | 2 +- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 73 +++--- paddle/fluid/operators/cudnn_rnn_cache.h | 4 +- paddle/fluid/operators/cumsum_op_mlu.cc | 8 +- paddle/fluid/operators/cumsum_op_npu.cc | 10 +- paddle/fluid/operators/cvm_op.cc | 2 +- paddle/fluid/operators/cvm_op.cu | 4 +- paddle/fluid/operators/cvm_op.h | 4 +- paddle/fluid/operators/data_norm_op.cc | 48 ++-- paddle/fluid/operators/data_norm_op.cu | 52 ++-- paddle/fluid/operators/decode_jpeg_op.cc | 2 +- .../fluid/operators/deformable_conv_op_mlu.cc | 38 +-- .../operators/deformable_psroi_pooling_op.cu | 30 ++- .../operators/deformable_psroi_pooling_op.h | 27 +- .../fluid/operators/dequantize_abs_max_op.cc | 6 +- .../fluid/operators/dequantize_abs_max_op.cu | 6 +- .../fluid/operators/dequantize_abs_max_op.h | 12 +- paddle/fluid/operators/dequantize_log_op.cc | 6 +- paddle/fluid/operators/dequantize_log_op.cu | 6 +- paddle/fluid/operators/dequantize_log_op.h | 12 +- paddle/fluid/operators/dequantize_op.h | 1 - .../detection/anchor_generator_op.cu | 14 +- .../operators/detection/anchor_generator_op.h | 8 +- .../fluid/operators/detection/bbox_util.cu.h | 2 +- paddle/fluid/operators/detection/bbox_util.h | 38 +-- .../operators/detection/bipartite_match_op.cc | 
11 +- .../fluid/operators/detection/box_clip_op.cu | 4 +- .../fluid/operators/detection/box_clip_op.h | 2 +- .../operators/detection/box_coder_op_npu.cc | 78 +++--- .../detection/box_decoder_and_assign_op.cu | 6 +- .../detection/box_decoder_and_assign_op.h | 6 +- .../detection/collect_fpn_proposals_op.cc | 2 +- .../detection/collect_fpn_proposals_op.cu | 8 +- .../detection/collect_fpn_proposals_op.h | 4 +- .../detection/density_prior_box_op.cu | 12 +- .../detection/density_prior_box_op.h | 10 +- .../detection/density_prior_box_op_npu.cc | 52 ++-- .../detection/generate_mask_labels_op.cc | 22 +- .../detection/generate_proposal_labels_op.cc | 52 ++-- .../detection/generate_proposals_op.cc | 21 +- .../detection/generate_proposals_op.cu | 21 +- .../detection/generate_proposals_v2_op.cc | 2 +- .../operators/detection/iou_similarity_op.h | 2 +- .../detection/iou_similarity_op_mlu.cc | 34 ++- .../detection/iou_similarity_op_npu.cc | 34 ++- .../detection/iou_similarity_op_xpu.cc | 2 +- .../detection/locality_aware_nms_op.cc | 16 +- .../operators/detection/matrix_nms_op.cc | 2 +- .../detection/mine_hard_examples_op.cc | 10 +- .../operators/detection/multiclass_nms_op.cc | 27 +- paddle/fluid/operators/detection/nms_op.cc | 2 - .../detection/polygon_box_transform_op.cc | 6 +- .../detection/polygon_box_transform_op.cu | 6 +- .../fluid/operators/detection/prior_box_op.cc | 2 +- .../fluid/operators/detection/prior_box_op.h | 10 +- .../operators/detection/prior_box_op_npu.cc | 10 +- .../retinanet_detection_output_op.cc | 12 +- .../detection/roi_perspective_transform_op.cc | 21 +- .../detection/roi_perspective_transform_op.cu | 21 +- .../detection/rpn_target_assign_op.cc | 45 ++-- .../detection/sigmoid_focal_loss_op.cc | 2 - .../detection/sigmoid_focal_loss_op.cu | 21 +- .../detection/sigmoid_focal_loss_op.h | 21 +- .../operators/detection/target_assign_op.h | 6 +- .../fluid/operators/detection/yolo_box_op.cc | 2 - .../operators/detection/yolov3_loss_op.cc | 2 - paddle/fluid/operators/detection_map_op.cc | 2 +- paddle/fluid/operators/detection_map_op.h | 10 +- paddle/fluid/operators/dgc_clip_by_norm_op.cc | 2 +- paddle/fluid/operators/dgc_clip_by_norm_op.h | 8 +- paddle/fluid/operators/dgc_op.cc | 2 +- paddle/fluid/operators/dgc_op.h | 24 +- paddle/fluid/operators/diag_op.h | 4 +- paddle/fluid/operators/dropout_impl.cu.h | 42 +-- paddle/fluid/operators/dropout_impl_util.h | 4 +- paddle/fluid/operators/dropout_op.cc | 4 +- paddle/fluid/operators/dropout_op_mlu.cc | 16 +- paddle/fluid/operators/dropout_op_npu.cc | 16 +- paddle/fluid/operators/eig_op.h | 6 +- paddle/fluid/operators/eigh_op.cc | 2 - paddle/fluid/operators/eigvalsh_op.cc | 2 - .../elementwise/elementwise_add_op_mlu.cc | 12 +- .../elementwise/elementwise_add_op_npu.cc | 12 +- .../elementwise/elementwise_div_op.h | 4 +- .../elementwise/elementwise_div_op_mlu.cc | 14 +- .../elementwise/elementwise_div_op_npu.cc | 20 +- .../elementwise_floordiv_op_npu.cc | 8 +- .../elementwise/elementwise_max_op_npu.cc | 18 +- .../elementwise/elementwise_min_op_mlu.cc | 2 +- .../elementwise/elementwise_min_op_npu.cc | 18 +- .../operators/elementwise/elementwise_mlu.h | 26 +- .../elementwise/elementwise_mod_op_npu.cc | 8 +- .../elementwise/elementwise_mul_op.h | 4 +- .../elementwise/elementwise_mul_op_mlu.cc | 12 +- .../elementwise/elementwise_mul_op_npu.cc | 22 +- .../operators/elementwise/elementwise_npu.h | 14 +- .../operators/elementwise/elementwise_op.h | 20 +- .../elementwise/elementwise_op_broadcast.cu.h | 4 +- .../elementwise/elementwise_op_function.h | 118 
++++----- .../elementwise/elementwise_op_impl.cu.h | 4 +- .../elementwise/elementwise_pow_op_mlu.cc | 12 +- .../elementwise/elementwise_pow_op_npu.cc | 18 +- .../elementwise/elementwise_sub_op_mlu.cc | 14 +- .../elementwise/elementwise_sub_op_npu.cc | 20 +- .../operators/elementwise/elementwise_xpu.h | 10 +- .../mkldnn/elementwise_mkldnn_op.h | 25 +- paddle/fluid/operators/empty_op.cc | 2 +- paddle/fluid/operators/expand_as_op.cc | 2 - paddle/fluid/operators/expand_as_op.h | 24 +- paddle/fluid/operators/expand_as_v2_op.cc | 2 - paddle/fluid/operators/expand_as_v2_op.h | 2 +- paddle/fluid/operators/expand_as_v2_op_mlu.cc | 8 +- paddle/fluid/operators/expand_as_v2_op_npu.cc | 6 +- paddle/fluid/operators/expand_op.cc | 6 +- paddle/fluid/operators/expand_op.h | 28 +- paddle/fluid/operators/expand_op_npu.cc | 2 +- paddle/fluid/operators/expand_v2_op.cc | 6 +- paddle/fluid/operators/expand_v2_op.h | 12 +- paddle/fluid/operators/expand_v2_op_mlu.cc | 4 +- paddle/fluid/operators/expand_v2_op_npu.cc | 10 +- paddle/fluid/operators/eye_op_npu.cc | 4 +- paddle/fluid/operators/fake_dequantize_op.cc | 24 +- .../fluid/operators/fake_dequantize_op.cu.h | 12 +- paddle/fluid/operators/fake_dequantize_op.h | 24 +- paddle/fluid/operators/fake_quantize_op.cc | 50 ++-- paddle/fluid/operators/fake_quantize_op.cu.h | 50 ++-- paddle/fluid/operators/fake_quantize_op.h | 144 +++++------ paddle/fluid/operators/fc_op.h | 6 +- paddle/fluid/operators/feed_forward_test.cu | 4 +- paddle/fluid/operators/fill_any_like_op.cc | 2 +- .../fluid/operators/fill_any_like_op_mlu.cc | 2 +- .../fluid/operators/fill_any_like_op_npu.cc | 2 +- .../fill_constant_batch_size_like_op_mlu.cc | 2 +- .../fill_constant_batch_size_like_op_npu.cc | 4 +- paddle/fluid/operators/fill_constant_op.cc | 2 +- .../fluid/operators/fill_constant_op_mlu.cc | 4 +- .../fluid/operators/fill_constant_op_npu.cc | 2 +- paddle/fluid/operators/fill_diagonal_op.cc | 4 +- .../operators/fill_diagonal_tensor_op.cc | 4 +- paddle/fluid/operators/fill_zeros_like_op.h | 2 +- .../fluid/operators/fill_zeros_like_op_npu.cc | 4 +- paddle/fluid/operators/filter_by_instag_op.cu | 4 +- paddle/fluid/operators/filter_by_instag_op.h | 4 +- paddle/fluid/operators/flatten_op.cc | 2 +- paddle/fluid/operators/flatten_op_npu.cc | 6 +- paddle/fluid/operators/flip_op.cc | 1 - paddle/fluid/operators/fsp_op.h | 19 +- .../fluid/operators/fused/attn_bias_add.cu.h | 2 +- paddle/fluid/operators/fused/attn_gemm.h | 28 +- paddle/fluid/operators/fused/attn_gemm_int8.h | 62 ++--- .../fluid/operators/fused/conv_fusion_op.cu | 16 +- .../operators/fused/cudnn_bn_add_relu_test.cc | 171 ++++++------- .../fused/cudnn_bn_stats_finalize.cu.h | 2 +- .../operators/fused/cudnn_norm_conv.cu.h | 2 +- .../operators/fused/cudnn_norm_conv_test.cc | 92 +++---- .../fused/cudnn_scale_bias_add_relu.cu.h | 2 +- paddle/fluid/operators/fused/fmha_ref.h | 74 +++--- .../operators/fused/fused_attention_op.cc | 6 +- .../operators/fused/fused_attention_op.cu | 185 ++++++++------ ...sed_bias_dropout_residual_layer_norm_op.cc | 6 +- ...sed_bias_dropout_residual_layer_norm_op.cu | 51 ++-- .../operators/fused/fused_bn_activation_op.cc | 40 +-- .../operators/fused/fused_bn_activation_op.cu | 43 ++-- .../operators/fused/fused_bn_activation_op.h | 4 +- .../fused/fused_bn_add_activation_op.cc | 6 +- .../fused/fused_bn_add_activation_op.cu | 47 ++-- .../fused/fused_bn_add_activation_op.h | 2 +- .../fused/fused_dropout_act_bias_test.cu | 4 +- .../operators/fused/fused_dropout_helper.h | 9 +- .../fused/fused_elemwise_activation_op.cc | 4 
+- .../fused/fused_elemwise_activation_op.h | 120 +++++---- .../fused_embedding_eltwise_layernorm_op.cc | 2 +- .../fused_embedding_eltwise_layernorm_op.cu | 14 +- .../fused/fused_embedding_fc_lstm_op.cc | 26 +- .../fused/fused_embedding_fc_lstm_op.h | 2 +- .../fused/fused_embedding_seq_pool_op.h | 2 +- .../fused_fc_elementwise_layernorm_op.cu | 18 +- .../operators/fused/fused_feedforward_op.cc | 4 +- .../operators/fused/fused_feedforward_op.cu | 241 +++++++++--------- .../operators/fused/fused_gate_attention.h | 167 ++++++------ .../fused/fused_gate_attention_op.cc | 2 +- .../fused/fused_gate_attention_op.cu | 116 +++++---- .../operators/fused/fused_gemm_epilogue_op.cc | 2 +- .../operators/fused/fused_gemm_epilogue_op.cu | 30 ++- .../fused/fused_gemm_epilogue_op_xpu.cc | 28 +- ...ed_layernorm_residual_dropout_bias_test.cu | 4 +- .../fused/fused_multi_transformer_int8_op.cc | 2 +- .../fused/fused_multi_transformer_int8_op.cu | 47 ++-- .../fused/fused_multi_transformer_op.cc | 4 +- .../fused/fused_multi_transformer_op.cu | 38 +-- .../fused/fused_multi_transformer_op.h | 4 +- .../fused/fused_residual_dropout_bias_test.cu | 4 +- .../operators/fused/fused_seqpool_cvm_op.cu | 2 +- .../operators/fused/fused_softmax_mask.cu.h | 2 - .../fused/fusion_conv_inception_op.cu | 12 +- .../operators/fused/fusion_group_op_test.cc | 18 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 10 +- paddle/fluid/operators/fused/fusion_gru_op.h | 2 +- .../fluid/operators/fused/fusion_lstm_op.cc | 16 +- paddle/fluid/operators/fused/fusion_lstm_op.h | 2 +- .../fused/fusion_repeated_fc_relu_op.cc | 10 +- .../fused/fusion_repeated_fc_relu_op.h | 2 +- .../fused/fusion_seqconv_eltadd_relu_op.cc | 6 +- .../fused/fusion_seqconv_eltadd_relu_op.h | 2 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 6 +- .../fused/fusion_seqexpand_concat_fc_op.h | 2 +- .../fused/fusion_seqpool_concat_op.h | 2 +- .../fused/fusion_seqpool_cvm_concat_op.h | 2 +- .../fused/fusion_squared_mat_sub_op.cc | 12 +- .../fused/fusion_squared_mat_sub_op.h | 2 +- .../fusion_transpose_flatten_concat_op.cc | 2 - .../fusion_transpose_flatten_concat_op.cu.cc | 4 +- .../fused/mkldnn/fusion_gru_mkldnn_op.cc | 24 +- .../fused/mkldnn/fusion_lstm_mkldnn_op.cc | 32 ++- .../fused/mkldnn/fusion_rnn_mkldnn.h | 8 +- .../fused/mkldnn/multi_gru_mkldnn_op.cc | 16 +- paddle/fluid/operators/fused/multi_gru_op.h | 1 - .../operators/fused/multihead_matmul_op.cu | 12 +- .../operators/fused/resnet_basic_block_op.cc | 42 +-- .../fused/resnet_basic_block_op_xpu.cc | 232 ++++++++++------- .../fluid/operators/fused/resnet_unit_op.cc | 22 +- .../fluid/operators/fused/resnet_unit_op.cu | 106 ++++---- .../operators/fused/resnet_unit_op_xpu.cc | 98 +++---- .../operators/fused/skip_layernorm_op.cu | 12 +- .../fluid/operators/fused/yolo_box_head_op.cu | 6 +- .../fluid/operators/fused/yolo_box_post_op.cu | 14 +- .../fluid/operators/fused_softmax_mask_op.cc | 2 - .../fused_softmax_mask_upper_triangle_op.cc | 2 - .../fused_softmax_mask_upper_triangle_op.cu | 13 +- .../fluid/operators/fused_token_prune_op.cc | 2 - .../fluid/operators/fused_token_prune_op.cu | 33 +-- paddle/fluid/operators/gather_nd_op.cc | 2 +- paddle/fluid/operators/gather_nd_op_mlu.cc | 20 +- paddle/fluid/operators/gather_nd_op_npu.cc | 20 +- paddle/fluid/operators/gather_op.cc | 4 +- paddle/fluid/operators/gather_op_mlu.cc | 12 +- paddle/fluid/operators/gather_op_npu.cc | 16 +- .../fluid/operators/gather_scatter_kernel.cc | 20 +- .../fluid/operators/gather_scatter_kernel.cu | 17 +- .../fluid/operators/gather_scatter_kernel.h 
| 26 +- paddle/fluid/operators/gather_test.cc | 6 +- paddle/fluid/operators/gaussian_random_op.cc | 6 +- paddle/fluid/operators/gaussian_random_op.cu | 2 +- .../fluid/operators/gaussian_random_op_mlu.cc | 4 +- .../fluid/operators/gaussian_random_op_npu.cc | 4 +- paddle/fluid/operators/gelu_op_npu.cc | 12 +- .../fluid/operators/graph_khop_sampler_op.cu | 20 +- .../fluid/operators/graph_khop_sampler_op.h | 20 +- .../operators/grid_sampler_cudnn_op.cu.cc | 20 +- paddle/fluid/operators/grid_sampler_op.cc | 2 +- paddle/fluid/operators/grid_sampler_op_mlu.cc | 8 +- paddle/fluid/operators/group_norm_op.cc | 4 +- paddle/fluid/operators/group_norm_op.cu | 33 +-- paddle/fluid/operators/group_norm_op.h | 31 +-- paddle/fluid/operators/group_norm_op_npu.cc | 69 +++-- paddle/fluid/operators/gru_op.cc | 8 +- paddle/fluid/operators/gru_op.cu.cc | 6 +- paddle/fluid/operators/gru_op.h | 18 +- paddle/fluid/operators/gru_unit_op.cc | 2 - paddle/fluid/operators/gru_unit_op.h | 41 +-- paddle/fluid/operators/hinge_loss_op.h | 14 +- paddle/fluid/operators/histogram_op.cc | 1 - paddle/fluid/operators/huber_loss_op_npu.cc | 46 ++-- paddle/fluid/operators/im2sequence_op.h | 14 +- paddle/fluid/operators/increment_op_npu.cc | 4 +- paddle/fluid/operators/index_sample_op_npu.cc | 14 +- paddle/fluid/operators/index_select_op.cc | 2 - paddle/fluid/operators/index_select_op.h | 2 +- paddle/fluid/operators/index_select_op_npu.cc | 15 +- paddle/fluid/operators/inplace_abn_op.cc | 94 +++---- paddle/fluid/operators/inplace_abn_op.cu | 48 ++-- paddle/fluid/operators/inplace_abn_op.h | 2 +- paddle/fluid/operators/instance_norm_op.cc | 20 +- paddle/fluid/operators/instance_norm_op.h | 2 +- .../fluid/operators/instance_norm_op_npu.cc | 14 +- paddle/fluid/operators/interpolate_op.cc | 5 +- paddle/fluid/operators/interpolate_op.cu | 73 +++--- paddle/fluid/operators/interpolate_op.h | 125 ++++----- paddle/fluid/operators/interpolate_op_npu.cc | 20 +- paddle/fluid/operators/interpolate_v2_op.cc | 5 +- .../fluid/operators/interpolate_v2_op_mlu.cc | 29 ++- .../fluid/operators/interpolate_v2_op_npu.cc | 111 ++++---- paddle/fluid/operators/ipu/ipu_runtime_op.cc | 4 +- paddle/fluid/operators/isfinite_op.h | 60 ++--- paddle/fluid/operators/jit/benchmark.cc | 2 +- paddle/fluid/operators/kldiv_loss_op.cc | 2 - paddle/fluid/operators/kldiv_loss_op_npu.cc | 16 +- paddle/fluid/operators/kron_op.cc | 4 +- paddle/fluid/operators/l1_norm_op.cc | 2 - paddle/fluid/operators/l1_norm_op.h | 14 +- paddle/fluid/operators/label_smooth_op_mlu.cc | 6 +- paddle/fluid/operators/label_smooth_op_npu.cc | 22 +- paddle/fluid/operators/layer_norm_kernel.cu.h | 6 +- paddle/fluid/operators/layer_norm_op.cc | 5 +- paddle/fluid/operators/layer_norm_op_mlu.cc | 31 +-- paddle/fluid/operators/layer_norm_op_npu.cc | 60 ++--- paddle/fluid/operators/layout_utils.h | 18 +- .../fluid/operators/limit_by_capacity_op.cu | 10 +- paddle/fluid/operators/linear_chain_crf_op.h | 125 +++++---- paddle/fluid/operators/linspace_op.cc | 2 +- paddle/fluid/operators/lod_reset_op.cc | 2 +- paddle/fluid/operators/lod_reset_op.h | 6 +- .../fluid/operators/lod_tensor_to_array_op.cc | 12 +- paddle/fluid/operators/log_loss_op_npu.cc | 39 +-- paddle/fluid/operators/log_loss_op_xpu.cc | 17 +- paddle/fluid/operators/log_softmax_op_npu.cc | 10 +- .../fluid/operators/lookup_table_dequant_op.h | 2 +- paddle/fluid/operators/lookup_table_op.h | 2 +- paddle/fluid/operators/lookup_table_v2_op.cu | 24 +- paddle/fluid/operators/lookup_table_v2_op.h | 6 +- .../fluid/operators/lookup_table_v2_op_mlu.cc | 2 +- 
.../fluid/operators/lookup_table_v2_op_npu.cc | 2 +- paddle/fluid/operators/lrn_op.cc | 25 +- paddle/fluid/operators/lrn_op.cu | 16 +- paddle/fluid/operators/lrn_op.h | 37 +-- paddle/fluid/operators/lstm_op.h | 31 +-- paddle/fluid/operators/lstm_unit_op.cu | 27 +- paddle/fluid/operators/lstm_unit_op.h | 27 +- paddle/fluid/operators/lstmp_op.h | 37 +-- paddle/fluid/operators/margin_rank_loss_op.h | 16 +- paddle/fluid/operators/marker_op.cu | 4 +- .../fluid/operators/masked_select_op_mlu.cc | 12 +- .../fluid/operators/masked_select_op_npu.cc | 12 +- .../fluid/operators/match_matrix_tensor_op.cc | 8 +- .../fluid/operators/match_matrix_tensor_op.h | 2 +- paddle/fluid/operators/math/beam_search.cc | 2 +- paddle/fluid/operators/math/beam_search.cu | 2 +- paddle/fluid/operators/math/beam_search.h | 2 +- .../fluid/operators/math/beam_search_npu.cc | 14 +- .../fluid/operators/math/beam_search_xpu.cc | 2 +- .../fluid/operators/math/concat_and_split.cc | 40 +-- .../fluid/operators/math/concat_and_split.cu | 10 +- .../fluid/operators/math/concat_and_split.h | 10 +- paddle/fluid/operators/math/concat_test.cc | 56 ++-- paddle/fluid/operators/math/context_project.h | 10 +- paddle/fluid/operators/math/cross_entropy.cc | 20 +- paddle/fluid/operators/math/cross_entropy.cu | 6 +- paddle/fluid/operators/math/cross_entropy.h | 6 +- paddle/fluid/operators/math/im2col.cc | 16 +- paddle/fluid/operators/math/im2col.cu | 16 +- paddle/fluid/operators/math/im2col.h | 8 +- paddle/fluid/operators/math/im2col_cfo_cpu.h | 12 +- paddle/fluid/operators/math/im2col_test.cc | 26 +- .../fluid/operators/math/matrix_bit_code.cc | 106 ++++---- paddle/fluid/operators/math/matrix_bit_code.h | 46 ++-- paddle/fluid/operators/math/maxouting.cc | 12 +- paddle/fluid/operators/math/maxouting.cu | 12 +- paddle/fluid/operators/math/maxouting.h | 12 +- paddle/fluid/operators/math/sample_prob.cu | 8 +- paddle/fluid/operators/math/sample_prob.h | 14 +- .../operators/math/selected_rows_functor.cc | 8 +- .../operators/math/selected_rows_functor.cu | 8 +- .../operators/math/selected_rows_functor.h | 8 +- .../math/selected_rows_functor_test.cc | 12 +- .../math/selected_rows_functor_test.cu.cc | 19 +- .../fluid/operators/math/sequence_padding.cc | 4 +- .../fluid/operators/math/sequence_pooling.cc | 12 +- .../fluid/operators/math/sequence_pooling.cu | 4 +- .../fluid/operators/math/sequence_pooling.h | 4 +- .../operators/math/sequence_pooling_test.cc | 4 +- paddle/fluid/operators/math/softmax.cu | 12 +- paddle/fluid/operators/math/softmax.h | 20 +- paddle/fluid/operators/math/softmax_impl.h | 50 ++-- paddle/fluid/operators/math/tree2col.cc | 14 +- paddle/fluid/operators/math/tree2col.cu | 14 +- paddle/fluid/operators/math/tree2col.h | 14 +- paddle/fluid/operators/math/unpooling.cc | 32 +-- paddle/fluid/operators/math/unpooling.cu | 32 +-- paddle/fluid/operators/math/unpooling.h | 32 +-- paddle/fluid/operators/math/vol2col.cc | 8 +- paddle/fluid/operators/math/vol2col.cu | 8 +- paddle/fluid/operators/math/vol2col.h | 8 +- paddle/fluid/operators/math/vol2col_test.cc | 16 +- paddle/fluid/operators/matmul_op.cc | 63 +++-- paddle/fluid/operators/matmul_op_mlu.cc | 40 +-- paddle/fluid/operators/matmul_op_npu.cc | 46 ++-- paddle/fluid/operators/matmul_op_xpu.cc | 18 +- paddle/fluid/operators/matmul_v2_op.cc | 4 +- paddle/fluid/operators/matmul_v2_op.h | 10 +- paddle/fluid/operators/matmul_v2_op_mlu.cc | 46 ++-- paddle/fluid/operators/matmul_v2_op_npu.cc | 40 +-- paddle/fluid/operators/mean_iou_op.cu | 16 +- paddle/fluid/operators/mean_iou_op.h | 18 +- 
paddle/fluid/operators/mean_op_mlu.cc | 12 +- paddle/fluid/operators/mean_op_npu.cc | 6 +- paddle/fluid/operators/memcpy_d2h_op.cc | 2 +- paddle/fluid/operators/memcpy_h2d_op.cc | 2 +- paddle/fluid/operators/memcpy_op.cc | 2 +- paddle/fluid/operators/meshgrid_op.cc | 4 +- paddle/fluid/operators/meshgrid_op_mlu.cc | 4 +- paddle/fluid/operators/meshgrid_op_npu.cc | 6 +- .../operators/metrics/accuracy_op_mlu.cc | 10 +- .../operators/metrics/accuracy_op_npu.cc | 12 +- .../operators/metrics/accuracy_op_xpu.cc | 14 +- .../operators/metrics/precision_recall_op.h | 16 +- paddle/fluid/operators/minus_op.h | 6 +- paddle/fluid/operators/miopen_lstm_cache.h | 2 +- paddle/fluid/operators/miopen_rnn_cache.h | 4 +- .../operators/mkldnn/activation_mkldnn_op.cc | 18 +- .../operators/mkldnn/batch_norm_mkldnn_op.cc | 49 ++-- .../operators/mkldnn/concat_mkldnn_op.cc | 37 +-- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 98 +++---- .../mkldnn/conv_transpose_mkldnn_op.cc | 27 +- .../operators/mkldnn/dequantize_mkldnn_op.cc | 8 +- .../operators/mkldnn/expand_v2_mkldnn_op.cc | 10 +- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 57 +++-- .../mkldnn/fill_constant_mkldnn_op.cc | 8 +- .../operators/mkldnn/interpolate_mkldnn_op.cc | 16 +- .../operators/mkldnn/layer_norm_mkldnn_op.cc | 22 +- .../fluid/operators/mkldnn/lrn_mkldnn_op.cc | 28 +- .../fluid/operators/mkldnn/matmul_mkldnn_op.h | 8 +- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 51 ++-- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 16 +- .../fluid/operators/mkldnn/pad3d_mkldnn_op.cc | 8 +- .../fluid/operators/mkldnn/prelu_mkldnn_op.cc | 27 +- .../operators/mkldnn/quantize_mkldnn_op.cc | 6 +- .../operators/mkldnn/requantize_mkldnn_op.cc | 8 +- .../operators/mkldnn/reshape_mkldnn_op.cc | 8 +- .../fluid/operators/mkldnn/shape_mkldnn_op.cc | 4 +- .../mkldnn/shuffle_channel_mkldnn_op.cc | 7 +- .../fluid/operators/mkldnn/slice_mkldnn_op.cc | 32 ++- .../operators/mkldnn/softmax_mkldnn_op.cc | 23 +- .../operators/mkldnn/softplus_mkldnn_op.h | 8 +- .../fluid/operators/mkldnn/split_mkldnn_op.cc | 11 +- .../fluid/operators/mkldnn/stack_mkldnn_op.cc | 13 +- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 4 +- .../operators/mkldnn/transpose_mkldnn_op.cc | 13 +- paddle/fluid/operators/mlu/mlu_baseop.cc | 8 +- paddle/fluid/operators/mlu/mlu_baseop.h | 20 +- .../fluid/operators/modified_huber_loss_op.cu | 10 +- .../fluid/operators/modified_huber_loss_op.h | 18 +- paddle/fluid/operators/mul_op.cc | 1 - paddle/fluid/operators/mul_op_npu.cc | 16 +- paddle/fluid/operators/multi_dot_op.cc | 2 +- paddle/fluid/operators/multinomial_op_npu.cc | 6 +- paddle/fluid/operators/multiplex_op.cc | 2 +- paddle/fluid/operators/nccl/nccl_op.cu.cc | 2 +- paddle/fluid/operators/nce_op.cc | 2 - paddle/fluid/operators/nce_op.h | 68 +++-- paddle/fluid/operators/norm_op_npu.cc | 16 +- paddle/fluid/operators/norm_utils.cu.h | 2 +- paddle/fluid/operators/number_count_op.cu | 2 +- paddle/fluid/operators/one_hot_op.cc | 2 +- paddle/fluid/operators/one_hot_op.cu | 4 +- paddle/fluid/operators/one_hot_op.h | 4 +- paddle/fluid/operators/one_hot_op_npu.cc | 4 +- paddle/fluid/operators/one_hot_op_xpu.cc | 4 +- paddle/fluid/operators/one_hot_v2_op.cc | 2 +- paddle/fluid/operators/one_hot_v2_op_mlu.cc | 5 +- paddle/fluid/operators/one_hot_v2_op_npu.cc | 4 +- .../fluid/operators/optimizers/adadelta_op.cc | 2 +- .../fluid/operators/optimizers/adagrad_op.cc | 2 +- paddle/fluid/operators/optimizers/adam_op.h | 4 +- .../operators/optimizers/adam_op_functor.h | 4 +- .../fluid/operators/optimizers/adam_op_mlu.cc | 
56 ++-- .../fluid/operators/optimizers/adam_op_npu.cc | 22 +- .../fluid/operators/optimizers/adamax_op.cc | 2 +- .../optimizers/decayed_adagrad_op.cc | 2 +- .../operators/optimizers/decayed_adagrad_op.h | 12 +- .../operators/optimizers/dgc_momentum_op.cc | 2 +- .../operators/optimizers/dgc_momentum_op.h | 42 +-- .../distributed_fused_lamb_init_op.cu | 72 +++--- .../optimizers/distributed_fused_lamb_op.cc | 2 +- .../optimizers/distributed_fused_lamb_op.cu | 39 ++- paddle/fluid/operators/optimizers/dpsgd_op.cc | 2 +- paddle/fluid/operators/optimizers/dpsgd_op.h | 8 +- paddle/fluid/operators/optimizers/ftrl_op.cc | 2 +- paddle/fluid/operators/optimizers/ftrl_op.h | 18 +- paddle/fluid/operators/optimizers/lamb_op.cc | 2 +- .../operators/optimizers/lars_momentum_op.cu | 2 +- .../operators/optimizers/lars_momentum_op.h | 2 +- .../operators/optimizers/merged_adam_op.cc | 4 +- .../optimizers/merged_momentum_op_mlu.cc | 12 +- .../optimizers/merged_momentum_op_npu.cc | 12 +- .../optimizers/mkldnn/sgd_mkldnn_op.cc | 12 +- .../fluid/operators/optimizers/momentum_op.cc | 2 +- .../operators/optimizers/momentum_op_mlu.cc | 12 +- .../operators/optimizers/momentum_op_npu.cc | 12 +- .../pow2_decay_with_linear_warmup_op.h | 8 +- .../pow2_decay_with_linear_warmup_op_xpu.cc | 8 +- .../optimizers/proximal_adagrad_op.cc | 2 +- .../optimizers/proximal_adagrad_op.h | 18 +- .../operators/optimizers/proximal_gd_op.cc | 2 +- .../operators/optimizers/proximal_gd_op.h | 13 +- .../operators/optimizers/rmsprop_op_npu.cc | 6 +- paddle/fluid/operators/optimizers/sgd_op.cc | 2 +- paddle/fluid/operators/optimizers/sgd_op.cu | 18 +- paddle/fluid/operators/optimizers/sgd_op.h | 34 +-- .../optimizers/sparse_momentum_op.cc | 2 +- .../operators/optimizers/sparse_momentum_op.h | 32 ++- paddle/fluid/operators/p_norm_op_npu.cc | 14 +- paddle/fluid/operators/pad2d_op.cc | 17 +- paddle/fluid/operators/pad2d_op.cu | 15 +- paddle/fluid/operators/pad3d_op.cc | 6 +- paddle/fluid/operators/pad3d_op_npu.cc | 13 +- .../fluid/operators/pad_constant_like_op.cc | 2 - paddle/fluid/operators/pad_constant_like_op.h | 14 +- paddle/fluid/operators/pad_op.cc | 2 - paddle/fluid/operators/pad_op_npu.cc | 11 +- paddle/fluid/operators/partial_concat_op.cc | 4 +- paddle/fluid/operators/partial_concat_op.cu | 8 +- paddle/fluid/operators/partial_concat_op.h | 8 +- paddle/fluid/operators/partial_sum_op.cc | 4 +- paddle/fluid/operators/partial_sum_op.cu | 9 +- paddle/fluid/operators/partial_sum_op.h | 8 +- paddle/fluid/operators/pool_op.cc | 6 +- paddle/fluid/operators/pool_op.h | 6 +- paddle/fluid/operators/pool_op_mlu.cc | 32 +-- paddle/fluid/operators/pool_op_npu.cc | 14 +- .../operators/positive_negative_pair_op.h | 25 +- paddle/fluid/operators/prelu_op.cc | 2 +- paddle/fluid/operators/prroi_pool_op.cc | 2 +- paddle/fluid/operators/prroi_pool_op.cu | 28 +- paddle/fluid/operators/prroi_pool_op.h | 22 +- paddle/fluid/operators/pscore/fake_init_op.cc | 2 +- .../operators/pull_box_extended_sparse_op.h | 10 +- paddle/fluid/operators/pull_box_sparse_op.h | 6 +- paddle/fluid/operators/pull_gpups_sparse_op.h | 6 +- paddle/fluid/operators/put_along_axis_op.cc | 4 +- paddle/fluid/operators/pyramid_hash_op.cc | 10 +- paddle/fluid/operators/quantize_linear_op.cc | 10 +- paddle/fluid/operators/quantize_linear_op.cu | 6 +- paddle/fluid/operators/quantize_linear_op.h | 30 +-- paddle/fluid/operators/quantize_op.h | 1 - paddle/fluid/operators/random_routing_op.cu | 2 +- paddle/fluid/operators/randperm_op.h | 4 +- paddle/fluid/operators/randperm_op_mlu.cc | 4 +- 
paddle/fluid/operators/range_op.cc | 2 +- paddle/fluid/operators/range_op.h | 8 +- paddle/fluid/operators/range_op_mlu.cc | 10 +- paddle/fluid/operators/range_op_npu.cc | 10 +- paddle/fluid/operators/rank_attention_op.cc | 2 +- paddle/fluid/operators/rank_attention_op.cu | 34 +-- paddle/fluid/operators/rank_loss_op.h | 20 +- paddle/fluid/operators/recurrent_op.cc | 10 +- .../operators/reduce_ops/logsumexp_op_xpu.cc | 4 +- .../operators/reduce_ops/reduce_any_op_npu.cc | 6 +- .../reduce_ops/reduce_any_op_npu_test.cc | 2 +- .../operators/reduce_ops/reduce_max_op_mlu.cc | 6 +- .../operators/reduce_ops/reduce_max_op_npu.cc | 18 +- .../reduce_ops/reduce_mean_op_mlu.cc | 8 +- .../reduce_ops/reduce_mean_op_npu.cc | 12 +- .../operators/reduce_ops/reduce_min_op_mlu.cc | 6 +- .../operators/reduce_ops/reduce_min_op_npu.cc | 8 +- .../fluid/operators/reduce_ops/reduce_op.cu.h | 4 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 70 ++--- .../operators/reduce_ops/reduce_op_function.h | 14 +- .../operators/reduce_ops/reduce_op_mlu.h | 4 +- .../operators/reduce_ops/reduce_op_xpu.h | 4 +- .../reduce_ops/reduce_prod_op_npu.cc | 8 +- .../operators/reduce_ops/reduce_sum_op.h | 13 +- .../operators/reduce_ops/reduce_sum_op_mlu.cc | 8 +- .../operators/reduce_ops/reduce_sum_op_npu.cc | 15 +- .../operators/reduce_ops/reduce_sum_op_xpu.cc | 7 +- .../fluid/operators/repeat_interleave_op.cc | 2 - paddle/fluid/operators/requantize_op.h | 1 - paddle/fluid/operators/reshape_op.cc | 18 +- paddle/fluid/operators/reshape_op_mlu.cc | 10 +- paddle/fluid/operators/reshape_op_npu.cc | 10 +- paddle/fluid/operators/rnn_op_mlu.cc | 68 ++--- paddle/fluid/operators/roi_align_op.cc | 2 +- paddle/fluid/operators/roi_align_op_mlu.cc | 14 +- paddle/fluid/operators/roi_align_op_npu.cc | 23 +- paddle/fluid/operators/roi_pool_op.cc | 2 +- paddle/fluid/operators/roll_op.cc | 2 - paddle/fluid/operators/row_conv_op.cc | 21 +- paddle/fluid/operators/row_conv_op.cu | 11 +- paddle/fluid/operators/rrelu_op.cc | 2 - paddle/fluid/operators/run_program_op.cc | 4 +- paddle/fluid/operators/sample_logits_op.cu | 38 +-- paddle/fluid/operators/sample_logits_op.h | 56 ++-- paddle/fluid/operators/sampling_id_op.cc | 2 +- paddle/fluid/operators/sampling_id_op.h | 6 +- paddle/fluid/operators/save_combine_op.cc | 4 +- paddle/fluid/operators/scale_op_mlu.cc | 10 +- paddle/fluid/operators/scale_op_npu.cc | 10 +- paddle/fluid/operators/scatter_nd_add_op.cc | 2 +- paddle/fluid/operators/scatter_op_mlu.cc | 8 +- paddle/fluid/operators/scatter_op_npu.cc | 12 +- paddle/fluid/operators/scatter_test.cc | 6 +- paddle/fluid/operators/search_compute.h | 2 +- paddle/fluid/operators/seed_op.cc | 2 +- paddle/fluid/operators/seed_op.cu | 2 +- paddle/fluid/operators/seed_op.h | 4 +- paddle/fluid/operators/seed_op_npu.cc | 2 +- .../sequence_ops/sequence_concat_op.h | 16 +- .../operators/sequence_ops/sequence_conv_op.h | 19 +- .../sequence_ops/sequence_conv_op_xpu.cc | 9 +- .../sequence_ops/sequence_mask_op.cc | 2 +- .../operators/sequence_ops/sequence_mask_op.h | 10 +- .../sequence_ops/sequence_mask_op_npu.cc | 10 +- .../operators/sequence_ops/sequence_pool_op.h | 10 +- .../sequence_ops/sequence_scatter_op.cc | 2 +- .../sequence_ops/sequence_scatter_op.h | 10 +- .../sequence_ops/sequence_slice_op.h | 18 +- .../sequence_softmax_cudnn_op.cu.cc | 2 +- .../sequence_ops/sequence_softmax_op.h | 2 +- .../sequence_topk_avg_pooling_op.h | 6 +- .../sequence_ops/sequence_unpad_op.h | 2 +- paddle/fluid/operators/set_value_op.cc | 4 +- paddle/fluid/operators/set_value_op.h | 2 +- 
paddle/fluid/operators/set_value_op_mlu.cc | 16 +- paddle/fluid/operators/set_value_op_npu.cc | 16 +- paddle/fluid/operators/shape_op.cc | 2 +- paddle/fluid/operators/shape_op_mlu.cc | 4 +- paddle/fluid/operators/shape_op_npu.cc | 6 +- paddle/fluid/operators/shard_index_op_npu.cc | 2 +- paddle/fluid/operators/share_buffer_op.cc | 2 +- paddle/fluid/operators/share_buffer_op.h | 4 +- paddle/fluid/operators/shuffle_batch_op.cc | 2 +- paddle/fluid/operators/shuffle_batch_op.cu | 18 +- paddle/fluid/operators/shuffle_batch_op.h | 2 +- paddle/fluid/operators/shuffle_channel_op.cu | 10 +- paddle/fluid/operators/shuffle_channel_op.h | 8 +- .../sigmoid_cross_entropy_with_logits_op.cc | 1 - ...igmoid_cross_entropy_with_logits_op_mlu.cc | 16 +- ...igmoid_cross_entropy_with_logits_op_npu.cc | 16 +- paddle/fluid/operators/similarity_focus_op.h | 6 +- paddle/fluid/operators/size_op.cc | 2 +- paddle/fluid/operators/size_op_mlu.cc | 4 +- paddle/fluid/operators/size_op_npu.cc | 4 +- paddle/fluid/operators/slice_op.cc | 11 +- paddle/fluid/operators/slice_op.h | 29 ++- paddle/fluid/operators/slice_op_mlu.cc | 33 ++- paddle/fluid/operators/slice_op_npu.cc | 33 ++- paddle/fluid/operators/smooth_l1_loss_op.h | 26 +- .../fluid/operators/smooth_l1_loss_op_npu.cc | 26 +- paddle/fluid/operators/softmax_op_mlu.cc | 2 +- paddle/fluid/operators/softmax_op_npu.cc | 2 +- .../softmax_with_cross_entropy_op_mlu.cc | 22 +- .../softmax_with_cross_entropy_op_npu.cc | 20 +- paddle/fluid/operators/space_to_depth_op.cc | 2 +- paddle/fluid/operators/sparse_attention_op.cu | 93 +++---- paddle/fluid/operators/spectral_norm_op.cc | 2 - paddle/fluid/operators/split_op.cc | 6 +- paddle/fluid/operators/split_op_mlu.cc | 10 +- paddle/fluid/operators/split_op_npu.cc | 6 +- paddle/fluid/operators/spp_op.h | 22 +- .../fluid/operators/squared_l2_distance_op.h | 18 +- paddle/fluid/operators/squared_l2_norm_op.cc | 2 - .../fluid/operators/squared_l2_norm_op_mlu.cc | 22 +- .../fluid/operators/squared_l2_norm_op_npu.cc | 14 +- paddle/fluid/operators/stack_op_mlu.cc | 6 +- paddle/fluid/operators/stack_op_npu.cc | 14 +- paddle/fluid/operators/stft_op.h | 14 +- paddle/fluid/operators/strided_memcpy.h | 6 +- paddle/fluid/operators/strided_slice_op.cc | 4 +- .../fluid/operators/strided_slice_op_mlu.cc | 38 +-- .../fluid/operators/strided_slice_op_npu.cc | 38 +-- .../operators/string/faster_tokenizer_op.cc | 2 +- .../operators/string/faster_tokenizer_op.h | 4 +- paddle/fluid/operators/sum_op.cc | 1 - paddle/fluid/operators/sum_op_mlu.cc | 4 +- paddle/fluid/operators/sum_op_npu.cc | 6 +- paddle/fluid/operators/sum_op_xpu.cc | 2 +- paddle/fluid/operators/svd_helper.h | 130 +++++----- .../fluid/operators/sync_batch_norm_op_mlu.cc | 43 ++-- .../fluid/operators/sync_batch_norm_op_npu.cc | 39 +-- paddle/fluid/operators/take_along_axis_op.cc | 4 +- .../fluid/operators/take_along_axis_op_npu.cc | 16 +- paddle/fluid/operators/tdm_child_op.h | 2 +- paddle/fluid/operators/tdm_sampler_op.h | 2 +- .../teacher_student_sigmoid_loss_op.cc | 2 +- .../teacher_student_sigmoid_loss_op.h | 19 +- paddle/fluid/operators/temporal_shift_op.cc | 2 - paddle/fluid/operators/temporal_shift_op.cu | 12 +- paddle/fluid/operators/temporal_shift_op.h | 8 +- .../operators/tensor_array_to_tensor_op.cc | 1 - .../operators/tensorrt/tensorrt_engine_op.h | 2 +- .../test_leaky_relu_grad_grad_functor.h | 21 +- paddle/fluid/operators/tile_op.cc | 6 +- paddle/fluid/operators/tile_op_functor.h | 6 +- paddle/fluid/operators/tile_op_mlu.cc | 8 +- paddle/fluid/operators/tile_op_npu.cc | 8 +- 
paddle/fluid/operators/top_k_function_cuda.h | 8 +- paddle/fluid/operators/top_k_op.cu | 20 +- paddle/fluid/operators/top_k_op.h | 20 +- paddle/fluid/operators/top_k_op_mlu.cc | 4 +- paddle/fluid/operators/top_k_op_npu.cc | 4 +- paddle/fluid/operators/top_k_op_xpu.cc | 10 +- paddle/fluid/operators/top_k_v2_op_mlu.cc | 4 +- paddle/fluid/operators/top_k_v2_op_npu.cc | 10 +- paddle/fluid/operators/transfer_layout_op.cc | 2 +- paddle/fluid/operators/transfer_layout_op.h | 4 +- paddle/fluid/operators/transpose_op.cc | 6 +- paddle/fluid/operators/transpose_op.cu.h | 14 +- paddle/fluid/operators/transpose_op.h | 4 +- paddle/fluid/operators/tree_conv_op.h | 24 +- paddle/fluid/operators/tril_triu_op_mlu.cc | 4 +- paddle/fluid/operators/tril_triu_op_npu.cc | 4 +- .../truncated_gaussian_random_op_mlu.cc | 4 +- .../truncated_gaussian_random_op_npu.cc | 6 +- paddle/fluid/operators/unbind_op.cc | 1 - .../uniform_random_inplace_op_xpu.cc | 2 +- paddle/fluid/operators/uniform_random_op.cc | 6 +- paddle/fluid/operators/uniform_random_op.cu | 6 +- paddle/fluid/operators/uniform_random_op.h | 16 +- .../fluid/operators/uniform_random_op_mlu.cc | 6 +- .../fluid/operators/uniform_random_op_npu.cc | 6 +- paddle/fluid/operators/unique_op.h | 76 +++--- .../fluid/operators/unique_with_counts_op.h | 8 +- paddle/fluid/operators/unsqueeze_op.cc | 2 +- paddle/fluid/operators/unsqueeze_op.h | 4 +- paddle/fluid/operators/unstack_op_mlu.cc | 8 +- paddle/fluid/operators/unstack_op_npu.cc | 12 +- paddle/fluid/operators/utils.h | 18 +- paddle/fluid/operators/var_conv_2d_op.cc | 8 +- paddle/fluid/operators/var_conv_2d_op.h | 2 +- paddle/fluid/operators/where_index_op_mlu.cc | 8 +- paddle/fluid/operators/where_index_op_npu.cc | 6 +- paddle/fluid/operators/where_op_mlu.cc | 8 +- paddle/fluid/operators/where_op_npu.cc | 18 +- .../platform/device/gpu/cuda/cudnn_desc.h | 7 +- .../platform/device/gpu/cuda/cudnn_helper.h | 2 +- .../platform/device/gpu/cudnn_desc_test.cc | 2 +- .../platform/device/gpu/rocm/miopen_desc.h | 1 - .../platform/device/gpu/rocm/miopen_helper.h | 2 +- .../fluid/platform/device/ipu/ipu_backend.cc | 4 +- .../fluid/platform/device/ipu/ipu_backend.h | 4 +- paddle/fluid/platform/device/ipu/ipu_utils.h | 2 +- .../fluid/platform/device/npu/npu_op_runner.h | 2 +- paddle/fluid/platform/device_code_test.cc | 12 +- paddle/fluid/platform/mkldnn_helper.h | 6 +- paddle/fluid/platform/mkldnn_reuse.h | 54 ++-- paddle/fluid/pybind/eager.cc | 18 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 12 +- paddle/fluid/pybind/eager_utils.cc | 8 +- paddle/fluid/pybind/eager_utils.h | 2 +- paddle/fluid/pybind/imperative.cc | 10 +- paddle/fluid/pybind/inference_api.cc | 2 +- paddle/fluid/pybind/pybind.cc | 2 +- paddle/fluid/pybind/tensor.cc | 140 +++++----- paddle/fluid/pybind/tensor_py.h | 80 +++--- paddle/infrt/api/infrt_api.cc | 20 +- paddle/infrt/api/infrt_api.h | 4 +- paddle/infrt/api/infrt_api_test.cc.in | 8 +- paddle/infrt/backends/tensorrt/trt_engine.cc | 6 +- paddle/infrt/backends/tensorrt/trt_engine.h | 9 +- paddle/infrt/backends/tensorrt/trt_utils.h | 4 +- .../infrt/pass/infrt_weights_unfold_pass.cc | 2 +- paddle/infrt/host_context/kernel_frame.cc | 2 +- .../host_context/mlir_to_runtime_translate.cc | 6 +- paddle/infrt/host_context/value.cc | 4 +- paddle/infrt/host_context/value.h | 12 +- .../infrt/kernel/phi/dense_tensor_kernels.cc | 56 ++-- .../infrt/kernel/phi/dense_tensor_kernels.h | 21 +- .../infershaped/infershape_launchers_test.cc | 12 +- .../infershaped_kernel_launcher.cc | 6 +- 
 .../phi/infershaped/infershaped_utils.h | 2 +- paddle/infrt/kernel/tensor_kernels.cc | 6 +- paddle/infrt/kernel/tensorrt/trt_helper.h | 2 +- paddle/infrt/kernel/tensorrt/trt_kernels.cc | 10 +- paddle/infrt/kernel/tensorrt/trt_kernels.h | 4 +- paddle/infrt/paddle/model_parser.cc | 4 +- paddle/infrt/paddle/model_parser.h | 2 +- paddle/infrt/tensor/phi/tensor_map.cc | 7 +- paddle/infrt/tensor/phi/tensor_map.h | 6 +- paddle/phi/api/include/tensor.h | 2 +- paddle/phi/api/lib/utils/tensor_utils.cc | 6 +- paddle/phi/api/lib/utils/tensor_utils.h | 4 +- .../phi/backends/custom/custom_device_test.cc | 14 +- paddle/phi/core/dense_tensor.h | 4 +- paddle/phi/core/dense_tensor.inl | 6 +- paddle/phi/core/dense_tensor_impl.cc | 2 +- paddle/phi/kernels/funcs/fc_functor.cc | 4 +- paddle/phi/kernels/funcs/math_function.cc | 36 +-- paddle/phi/kernels/funcs/math_function.cu | 28 +- paddle/phi/kernels/funcs/math_function.h | 42 ++- paddle/phi/kernels/funcs/math_function_impl.h | 46 ++-- paddle/phi/kernels/funcs/sequence2batch.cc | 4 +- paddle/phi/kernels/funcs/sequence2batch.cu | 4 +- paddle/phi/kernels/funcs/sequence2batch.h | 4 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 2 +- paddle/phi/kernels/gpu/depthwise_conv.h | 61 ++--- .../impl/average_accumulates_kernel_impl.h | 14 +- .../phi/tests/kernels/test_math_function.cc | 38 +-- .../phi/tests/kernels/test_math_function.cu | 100 ++++---- .../custom_op/custom_raw_op_kernel_op.cc | 9 +- .../custom_op/custom_raw_op_kernel_op.cu | 5 +- .../tests/custom_op/custom_raw_op_kernel_op.h | 10 +-
 1042 files changed, 8120 insertions(+), 7816 deletions(-)

diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc
index b98e85f9c23e5..915a1ffa15f6b 100644
--- a/paddle/fluid/distributed/ps/service/brpc_utils.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc
@@ -282,7 +282,7 @@ void DeserializeSelectedRows(
     const platform::DeviceContext& ctx) {
   const auto place = ctx.GetPlace();
   auto* slr = var->GetMutable();
-  framework::Tensor* tensor = slr->mutable_value();
+  phi::DenseTensor* tensor = slr->mutable_value();
   slr->set_height(msg.slr_height());
   std::vector tmp_rows(msg.dims()[0]);
   memcpy(tmp_rows.data(), msg.data().data(), msg.dims()[0] * sizeof(int64_t));
diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h
index 8026b8e368478..8bddb87d1fef0 100644
--- a/paddle/fluid/eager/eager_tensor.h
+++ b/paddle/fluid/eager/eager_tensor.h
@@ -248,7 +248,7 @@ class EagerVariable final {
     // Construct allocation only once.
if (var_.IsInitialized()) { if (var_.IsType() || - var_.IsType()) { + var_.IsType()) { return SetImplWithLegacyTensor(); } else if (var_.IsType()) { return SetImplWithLegacyTensor(); @@ -286,7 +286,7 @@ class EagerVariable final { template void ConstructVariableFromTensor(const paddle::experimental::Tensor& tensor) { auto* framework_tensor = var_.GetMutable(); - // Contruct framework::Tensor from egr::EagerVariable + // Contruct phi::DenseTensor from egr::EagerVariable auto tensor_dense = std::dynamic_pointer_cast(tensor.impl()); PADDLE_ENFORCE_EQ( (tensor_dense.get() && tensor_dense), @@ -303,7 +303,7 @@ class EagerVariable final { void ConstructVariableFromCompatTensor( const paddle::experimental::Tensor& tensor) { auto* framework_holder = var_.GetMutable(); - // Contruct framework::Tensor from egr::EagerVariable + // Contruct phi::DenseTensor from egr::EagerVariable auto* compat_tensor = static_cast(tensor.impl().get()); PADDLE_ENFORCE_NOT_NULL(compat_tensor, diff --git a/paddle/fluid/framework/attribute_checker.h b/paddle/fluid/framework/attribute_checker.h index 24f3f0be96b6c..6552d167e1d01 100644 --- a/paddle/fluid/framework/attribute_checker.h +++ b/paddle/fluid/framework/attribute_checker.h @@ -246,7 +246,7 @@ class TypedAttrChecker { true, platform::errors::InvalidArgument( "Found Attribute('%s') with type(Variable), but it " - "doesn't support Tensor type.", + "doesn't support phi::DenseTensor type.", attr_name_)); VLOG(1) << "Found Attribute " << attr_name_ << " with type(Variable)."; diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index d4f36be5e87e7..10e0b76f00459 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -46,7 +46,7 @@ static bool CopySameTensorTestMain(const DDim &dims, FLAGS_use_system_allocator = true; // force to use system allocator // Step 1: create a cpu tensor and initialize it with random value; - Tensor src_cpu_tensor; + phi::DenseTensor src_cpu_tensor; { src_cpu_tensor.Resize(dims); auto *src_ptr_cpu = src_cpu_tensor.mutable_data(platform::CPUPlace()); @@ -60,9 +60,9 @@ static bool CopySameTensorTestMain(const DDim &dims, } // Step 2: copy the source tensor to dst place - Tensor dst_cpu_tensor; + phi::DenseTensor dst_cpu_tensor; { - Tensor src_tensor; + phi::DenseTensor src_tensor; TensorCopySync(src_cpu_tensor, src_place, &src_tensor); // The source tensor and dst_tensor is the same diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index e7ed9f2108128..8f778e4babe4e 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -133,8 +133,8 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, for (auto& in_name : inputs) { VLOG(3) << "Custom Operator: input name - " << in_name; if (detail::IsDuplicableVar(in_name)) { - // return const std::vector - auto vec_x = ctx.MultiInput(in_name); + // return const std::vector + auto vec_x = ctx.MultiInput(in_name); PADDLE_ENFORCE_NE(vec_x.empty(), true, platform::errors::NotFound( @@ -161,7 +161,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, } kernel_ctx.EmplaceBackInputs(std::move(custom_vec_in)); } else { - auto* x = ctx.Input(in_name); + auto* x = ctx.Input(in_name); PADDLE_ENFORCE_NOT_NULL( x, platform::errors::NotFound("Input tensor (%s) is nullptr.", in_name)); @@ -222,7 +222,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, 
VLOG(3) << "Custom Operator: push outputs into CustomOpKernelContext."; // cache the target tensor pointers - std::vector true_out_ptrs; + std::vector true_out_ptrs; for (size_t i = 0; i < outputs.size(); ++i) { auto out_name = outputs[i]; if (detail::IsDuplicableVar(out_name)) { @@ -231,7 +231,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, "If custom operator's outputs contains `paddle::Vec(" ")` type, " "it only can hold one output.")); - auto vec_out = ctx.MultiOutput(out_name); + auto vec_out = ctx.MultiOutput(out_name); PADDLE_ENFORCE_NE(vec_out.empty(), true, platform::errors::NotFound( @@ -253,7 +253,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, } kernel_ctx.EmplaceBackOutputs(std::move(custom_vec_out)); } else { - auto* out = ctx.Output(out_name); + auto* out = ctx.Output(out_name); PADDLE_ENFORCE_NOT_NULL(out, platform::errors::NotFound( "Output tensor (%s) is nullptr.", out_name)); @@ -431,7 +431,7 @@ class CustomOperator : public OperatorWithKernel { */ framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const OpKernelType& expected_kernel_type) const override { return OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, @@ -511,7 +511,7 @@ class CustomOpMaker : public OpProtoAndCheckerMaker { AddComment(R"DOC( Custom Operator. -According to the Tensor operation function implemented by the user +According to the phi::DenseTensor operation function implemented by the user independently of the framework, it is encapsulated into a framework operator to adapt to various execution scenarios such as dynamic graph, mode static graph mode, and inference mode. diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index e65ecff60edd7..c8c92e95ea3a5 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -16,9 +16,9 @@ limitations under the License. */ namespace paddle { namespace framework { -void TransDataDevice(const Tensor &in, +void TransDataDevice(const phi::DenseTensor &in, const platform::Place &dst_place, - Tensor *out) { + phi::DenseTensor *out) { VLOG(3) << "DeviceTransform in, src_place " << in.place() << " dst_place: " << dst_place; diff --git a/paddle/fluid/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h index cb6b5feab2fca..55130519c4a6a 100644 --- a/paddle/fluid/framework/data_device_transform.h +++ b/paddle/fluid/framework/data_device_transform.h @@ -21,9 +21,9 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -void TransDataDevice(const Tensor& in, +void TransDataDevice(const phi::DenseTensor& in, const platform::Place& dst_place, - Tensor* out); + phi::DenseTensor* out); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index cd76747c03599..3e017f9b39377 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -55,7 +55,7 @@ class TestOpWithKernel : public OperatorWithKernel { } else { VLOG(3) << "use default kernel"; return OpKernelType(proto::VarType::FP32, - ctx.Input("input")->place()); + ctx.Input("input")->place()); } } }; @@ -66,7 +66,7 @@ class TestKernel : public OpKernel { void Compute(const ExecutionContext& ctx) const { std::cout << ctx.DebugString() << std::endl; - const Tensor* input = ctx.Input("input"); + const phi::DenseTensor* input = ctx.Input("input"); std::cout << "input place:" << input->place() << std::endl; auto* output = ctx.Output("output"); @@ -158,7 +158,7 @@ TEST(Operator, CPUtoGPU) { paddle::platform::DeviceContextPool::Instance(); auto dev_ctx = pool.Get(cuda_place); - paddle::framework::Tensor output_tensor; + phi::DenseTensor output_tensor; paddle::framework::TensorCopy(output2->Get(), paddle::platform::CPUPlace(), *dev_ctx, diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index a7ab70948795f..18d4cc7d4dc5c 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -19,7 +19,7 @@ message Slot { required string type = 2; optional bool is_dense = 3 [ default = false ]; optional bool is_used = 4 [ default = false ]; - repeated int32 shape = 5; // we can define N-D Tensor + repeated int32 shape = 5; // we can define N-D phi::DenseTensor } message MultiSlotDesc { diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 3c6a89f2939a7..254a7abd66db5 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -55,8 +55,8 @@ void CastDataLayout::apply() { void TransDataLayout(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { PADDLE_ENFORCE( platform::places_are_same_class(kernel_type_for_var.place_, expected_kernel_type.place_), @@ -97,7 +97,8 @@ using dnnl::memory; using dnnl::primitive; using dnnl::reorder; -void* GetDataFromTensor(const Tensor& tensor, dnnl::memory::data_type type) { +void* GetDataFromTensor(const phi::DenseTensor& tensor, + dnnl::memory::data_type type) { switch (type) { case dnnl::memory::data_type::f32: return platform::to_void_cast(tensor.data()); @@ -117,8 +118,8 @@ void* GetDataFromTensor(const Tensor& tensor, dnnl::memory::data_type type) { void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { auto in_layout = kernel_type_for_var.data_layout_; auto out_layout = expected_kernel_type.data_layout_; auto place = expected_kernel_type.place_; @@ -139,8 +140,8 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, - const Tensor& in, - Tensor* out, 
+ const phi::DenseTensor& in, + phi::DenseTensor* out, platform::Place place, bool always_copy) { // Set default as NCHW in case not specified diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 5eb1f3ecb49a7..b30884fa6f0da 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -38,12 +38,12 @@ namespace framework { struct CastDataLayout { CastDataLayout(const platform::DeviceContext* ctx, const std::vector& axis, - const framework::Tensor& in, - framework::Tensor* out) + const phi::DenseTensor& in, + phi::DenseTensor* out) : in_(in), out_(out), ctx_(ctx), axis_(axis) {} - const framework::Tensor in_; - framework::Tensor* out_; + const phi::DenseTensor in_; + phi::DenseTensor* out_; const platform::DeviceContext* ctx_; const std::vector axis_; @@ -101,17 +101,17 @@ inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, - const Tensor& in, - Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, platform::Place place, bool always_copy = false); void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, - const Tensor& in, - Tensor* out); + const phi::DenseTensor& in, + phi::DenseTensor* out); -void* GetDataFromTensor(const Tensor& tensor, MKLDNNDataType type); +void* GetDataFromTensor(const phi::DenseTensor& tensor, MKLDNNDataType type); #endif @@ -119,8 +119,8 @@ std::vector GetAxis(const DataLayout& from, const DataLayout& to); void TransDataLayout(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, - const Tensor& in, - Tensor* out); + const phi::DenseTensor& in, + phi::DenseTensor* out); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc index f1ac6b5216ecd..0c329a3e8c222 100644 --- a/paddle/fluid/framework/data_layout_transform_test.cc +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -18,8 +18,8 @@ TEST(DataTransform, DataLayoutFunction) { auto place = paddle::platform::CPUPlace(); - paddle::framework::Tensor in = paddle::framework::Tensor(); - paddle::framework::Tensor out = paddle::framework::Tensor(); + phi::DenseTensor in = phi::DenseTensor(); + phi::DenseTensor out = phi::DenseTensor(); in.mutable_data(phi::make_ddim({2, 3, 1, 2}), place); in.set_layout(paddle::framework::DataLayout::kNHWC); @@ -48,7 +48,7 @@ TEST(DataTransform, DataLayoutFunction) { #ifdef PADDLE_WITH_MKLDNN TEST(DataTransformBf16, GetDataFromTensorDNNL) { auto place = paddle::platform::CPUPlace(); - paddle::framework::Tensor in = paddle::framework::Tensor(); + phi::DenseTensor in = phi::DenseTensor(); in.mutable_data(phi::make_ddim({2, 3, 1, 2}), place); @@ -61,7 +61,7 @@ TEST(DataTransformBf16, GetDataFromTensorDNNL) { TEST(DataTransformInt32, GetDataFromTensorDNNL) { auto place = paddle::platform::CPUPlace(); - paddle::framework::Tensor in = paddle::framework::Tensor(); + phi::DenseTensor in = phi::DenseTensor(); in.mutable_data(phi::make_ddim({2, 3, 1, 2}), place); void* in_data = diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 044bf1fca39e2..db8c3c8c86cb5 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -31,19 +31,19 @@ class Variable; namespace paddle { 
namespace framework { -static void PassTensorData(Tensor *from, Tensor *to) { +static void PassTensorData(phi::DenseTensor *from, phi::DenseTensor *to) { to->ShareDataWith(*from); - *from = Tensor(); + *from = phi::DenseTensor(); } void TransformData(const OpKernelType &expected_kernel_type, const OpKernelType &kernel_type_for_var, - const Tensor &input_tensor, - Tensor *output_tensor) { + const phi::DenseTensor &input_tensor, + phi::DenseTensor *output_tensor) { bool transformed = false; - Tensor in; + phi::DenseTensor in; in.ShareDataWith(input_tensor); - Tensor out; + phi::DenseTensor out; const DataLayout lin = kernel_type_for_var.data_layout_; const DataLayout lout = expected_kernel_type.data_layout_; // do layout transform @@ -120,7 +120,7 @@ void TransformData(const OpKernelType &expected_kernel_type, } void SetTensorToVariable(const Variable &in_var, - const Tensor &tensor, + const phi::DenseTensor &tensor, Variable *out_var) { if (in_var.IsType()) { auto &in_lod_tensor = in_var.Get(); diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h index 7fe20beec7dd7..2fcea7803ed31 100644 --- a/paddle/fluid/framework/data_transform.h +++ b/paddle/fluid/framework/data_transform.h @@ -35,14 +35,14 @@ class Variable; void TransformData(const OpKernelType &expected_kernel_type, const OpKernelType &kernel_type_for_var, - const Tensor &input_tensor, - Tensor *out); + const phi::DenseTensor &input_tensor, + phi::DenseTensor *out); /** * Set OutVar from InVar, except the tensor is shared with `tensor` */ void SetTensorToVariable(const Variable &in_var, - const Tensor &tensor, + const phi::DenseTensor &tensor, Variable *out_var); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 01802c11d5219..9f36bd4636890 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -20,13 +20,12 @@ #include "paddle/fluid/framework/tensor.h" TEST(DataType, float16) { - using paddle::framework::Tensor; using paddle::platform::CPUPlace; using paddle::platform::float16; namespace f = paddle::framework; f::proto::VarType::Type dtype = f::proto::VarType::FP16; - Tensor tensor; + phi::DenseTensor tensor; CPUPlace cpu; tensor.mutable_data(cpu, f::TransToPhiDataType(dtype)); @@ -43,13 +42,12 @@ TEST(DataType, float16) { } TEST(DataType, bfloat16) { - using paddle::framework::Tensor; using paddle::platform::bfloat16; using paddle::platform::CPUPlace; namespace f = paddle::framework; f::proto::VarType::Type dtype = f::proto::VarType::BF16; - Tensor tensor; + phi::DenseTensor tensor; CPUPlace cpu; tensor.mutable_data(cpu, f::TransToPhiDataType(dtype)); diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 59d20306c665a..e7abe21daeb5e 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -35,8 +35,8 @@ struct CastDataTypeFunctor { #if defined(PADDLE_WITH_XPU) template -static void XPUCastData(const framework::Tensor& in, - framework::Tensor* out, +static void XPUCastData(const phi::DenseTensor& in, + phi::DenseTensor* out, const platform::XPUDeviceContext* dev_ctx) { using XPUInTDType = typename XPUTypeTrait::Type; using XPUOutTDType = typename XPUTypeTrait::Type; @@ -51,8 +51,8 @@ static void XPUCastData(const framework::Tensor& in, template static void XPUTransDataType( - const framework::Tensor& in, - framework::Tensor* out, + 
const phi::DenseTensor& in, + phi::DenseTensor* out, const paddle::framework::proto::VarType::Type& dst_type, const platform::DeviceContext* ctx) { auto* context = static_cast(ctx); @@ -79,12 +79,12 @@ static void XPUTransDataType( template struct CastDataType { - CastDataType(const framework::Tensor& in, - framework::Tensor* out, + CastDataType(const phi::DenseTensor& in, + phi::DenseTensor* out, const platform::DeviceContext* ctx) : in_(in), out_(out), ctx_(ctx) {} - const framework::Tensor in_; - framework::Tensor* out_; + const phi::DenseTensor in_; + phi::DenseTensor* out_; const platform::DeviceContext* ctx_; template @@ -121,8 +121,8 @@ struct CastDataType { void TransDataType(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { PADDLE_ENFORCE_EQ( framework::TransToProtoVarType(in.dtype()), kernel_type_for_var.data_type_, @@ -135,9 +135,9 @@ void TransDataType(const OpKernelType& kernel_type_for_var, TransDataType(in, dst_type, out); } -void TransDataType(const Tensor& in, +void TransDataType(const phi::DenseTensor& in, const paddle::framework::proto::VarType::Type& type, - Tensor* out) { + phi::DenseTensor* out) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); out->Resize(in.dims()); @@ -213,8 +213,8 @@ void TransDataType(const Tensor& in, void TransComplexToReal(const proto::VarType::Type& dst_type, const proto::VarType::Type& src_type, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { auto& pool = platform::DeviceContextPool::Instance(); auto* ctx = pool.Get(in.place()); out->Resize(in.dims()); diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h index b6449861369a2..619e15b6045e8 100644 --- a/paddle/fluid/framework/data_type_transform.h +++ b/paddle/fluid/framework/data_type_transform.h @@ -30,11 +30,11 @@ using KernelTypePair = std::pair; void TransDataType(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, - const Tensor& in, - Tensor* out); -void TransDataType(const Tensor& in, + const phi::DenseTensor& in, + phi::DenseTensor* out); +void TransDataType(const phi::DenseTensor& in, const paddle::framework::proto::VarType::Type& type, - Tensor* out); + phi::DenseTensor* out); /** * Transform complex gradient to real data type. 
@@ -49,8 +49,8 @@ void TransDataType(const Tensor& in, */ void TransComplexToReal(const proto::VarType::Type& dst_type, const proto::VarType::Type& src_type, - const Tensor& in, - Tensor* out); + const phi::DenseTensor& in, + phi::DenseTensor* out); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc index 64d91611ab40a..cfdcb18a841b8 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -63,8 +63,8 @@ TEST(DataTypeTransform, CPUTransform) { // data type transform from float32 { - paddle::framework::Tensor in; - paddle::framework::Tensor out; + phi::DenseTensor in; + phi::DenseTensor out; float* ptr = in.mutable_data(phi::make_ddim({2, 3}), place); int data_number = 2 * 3; @@ -88,8 +88,8 @@ TEST(DataTypeTransform, CPUTransform) { // data type transform from/to float16 { - paddle::framework::Tensor in; - paddle::framework::Tensor out; + phi::DenseTensor in; + phi::DenseTensor out; paddle::platform::float16* ptr = in.mutable_data( phi::make_ddim({2, 3}), place); @@ -201,8 +201,8 @@ TEST(DataTypeTransform, CPUTransform) { // data type transform from/to bfloat16 { - paddle::framework::Tensor in; - paddle::framework::Tensor out; + phi::DenseTensor in; + phi::DenseTensor out; paddle::platform::bfloat16* ptr = in.mutable_data(phi::make_ddim({2, 3}), @@ -315,8 +315,8 @@ TEST(DataTypeTransform, CPUTransform) { // data type transform from/to int32 { - paddle::framework::Tensor in; - paddle::framework::Tensor out; + phi::DenseTensor in; + phi::DenseTensor out; int32_t* ptr = in.mutable_data(phi::make_ddim({2, 3}), place); int data_number = 2 * 3; diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index 8490afd69d9ea..0a808cfdbf738 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -62,10 +62,10 @@ TEST(DataTypeTransform, GPUTransform) { // data type transform from float32 { - paddle::framework::Tensor in; - paddle::framework::Tensor in_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; + phi::DenseTensor in; + phi::DenseTensor in_gpu; + phi::DenseTensor out_gpu; + phi::DenseTensor out; float* in_ptr = in.mutable_data(phi::make_ddim({2, 3}), cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; @@ -97,10 +97,10 @@ TEST(DataTypeTransform, GPUTransform) { // data type transform from/to float16 { - paddle::framework::Tensor in; - paddle::framework::Tensor in_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; + phi::DenseTensor in; + phi::DenseTensor in_gpu; + phi::DenseTensor out_gpu; + phi::DenseTensor out; paddle::platform::float16* ptr = in.mutable_data( phi::make_ddim({2, 3}), cpu_place); diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 01f707eb9baaf..293ef3492691c 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -248,7 +248,7 @@ void AllReduceOpHandle::AllReduceFunc( ->FindVar(out_var_names[0]) ->GetMutable(); - // Reduce All Tensor to trg in CPU + // Reduce All phi::DenseTensor to trg in CPU ReduceBufferData func(lod_tensor_data, trg.data(), numel); VisitDataType(framework::TransToProtoVarType(trg.dtype()), func); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc 
b/paddle/fluid/framework/details/broadcast_op_handle.cc index 18b9dc3ffac9f..d28f81f3556cc 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -62,7 +62,7 @@ void BroadcastOpHandle::BroadcastOneVar( in_var, platform::errors::NotFound("Variable %s is not found in scopes.", in_var_handle.name())); - Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); + phi::DenseTensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); if (UNLIKELY(!in_tensor.IsInitialized())) { VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!"; return; @@ -236,7 +236,7 @@ void BroadcastOpHandle::InitOutputValue( auto *in_var = var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name()); - Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); + phi::DenseTensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); // NOTE: The tensors' Place of input and output must be all on GPU or all on // CPU. diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 154bf2b354e1a..f0825196e4478 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -286,7 +286,7 @@ struct TestBroadcastOpHandle { } p::CPUPlace cpu_place; - f::Tensor result_tensor; + phi::DenseTensor result_tensor; f::TensorCopySync(rt, cpu_place, &result_tensor); float* ct = result_tensor.data(); @@ -312,7 +312,7 @@ struct TestBroadcastOpHandle { "the expected, expect %s, but got %s.", lod, tensor.lod())); - f::Tensor result_tensor; + phi::DenseTensor result_tensor; f::TensorCopySync(tensor, cpu_place, &result_tensor); float* ct = result_tensor.mutable_data(cpu_place); for (int64_t k = 0; k < phi::product(kDims); ++k) { diff --git a/paddle/fluid/framework/details/build_strategy_test.cc b/paddle/fluid/framework/details/build_strategy_test.cc index 11c0746acc7b7..4b184ba552898 100644 --- a/paddle/fluid/framework/details/build_strategy_test.cc +++ b/paddle/fluid/framework/details/build_strategy_test.cc @@ -52,7 +52,8 @@ class SumOpWithKernel : public OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override {} OpKernelType GetExpectedKernelType( const ExecutionContext &ctx) const override { - return OpKernelType(proto::VarType::FP32, ctx.Input("X")->place()); + return OpKernelType(proto::VarType::FP32, + ctx.Input("X")->place()); } }; diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index adf49c81c049a..ec49510e0d41b 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -130,8 +130,8 @@ static void CheckTensorAttrs(const LoDTensor *tensor, offset)); } -static void TransData(const framework::Tensor *src_item, - framework::Tensor *dst_item, +static void TransData(const phi::DenseTensor *src_item, + phi::DenseTensor *dst_item, const platform::DeviceContext &ctx) { if (src_item->IsInitialized() && src_item->numel() > 0) { if (platform::is_gpu_place(src_item->place())) { diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 91959d5146be6..50b34b57ec5c0 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -77,7 +77,7 @@ void GatherOpHandle::RunImpl() { auto 
&pre_in_value = pre_in_var->Get(); std::vector out_rows; - std::vector in_tensors; + std::vector in_tensors; // Gather the inputs for (auto *in_handle : in_var_handles) { @@ -121,7 +121,7 @@ void GatherOpHandle::RunImpl() { out_dim[0] = static_cast(rows); out_value->mutable_value()->Resize(out_dim).mutable_data( t_out_p, pre_in_value.value().dtype()); - Tensor *out_tensor = out_value->mutable_value(); + phi::DenseTensor *out_tensor = out_value->mutable_value(); // copy auto dev_ctx = dev_ctxes_.at(out_var_handle->place()); diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 45d8939f788a0..3437eb5570dc7 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -204,7 +204,7 @@ struct TestGatherOpHandle { out_select_rows.rows()[k])); } - f::Tensor result_tensor; + phi::DenseTensor result_tensor; f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor); float* ct = result_tensor.data(); diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index bce7b64e6d735..e749d1568ff4b 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -332,7 +332,7 @@ void TensorCheckerVisitor::apply( template <> void tensor_check(const std::string& op_type, const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const platform::Place& place) { TensorCheckerVisitor vistor( op_type, var_name, tensor, place); @@ -348,7 +348,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, platform::errors::NotFound( "Cannot find var: `%s` in op `%s`.", var_name, op_type)); - const Tensor* tensor{nullptr}; + const phi::DenseTensor* tensor{nullptr}; if (var->IsType()) { tensor = &var->Get(); } else if (var->IsType()) { @@ -371,7 +371,8 @@ void CheckVarHasNanOrInf(const std::string& op_type, tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.", + "phi::DenseTensor[%s] use gpu place. PaddlePaddle must compile with " + "GPU.", var_name)); #endif return; @@ -400,10 +401,13 @@ void CheckVarHasNanOrInf(const std::string& op_type, flag, true, platform::errors::Fatal( - "Operator %s output Tensor %s contains Inf.", op_type, var_name)); + "Operator %s output phi::DenseTensor %s contains Inf.", + op_type, + var_name)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Tensor[%s] use xpu place. PaddlePaddle must compile with XPU.", + "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile with " + "XPU.", var_name)); #endif return; @@ -431,10 +435,13 @@ void CheckVarHasNanOrInf(const std::string& op_type, flag, true, platform::errors::Fatal( - "Operator %s output Tensor %s contains Inf.", op_type, var_name)); + "Operator %s output phi::DenseTensor %s contains Inf.", + op_type, + var_name)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Tensor[%s] use npu place. PaddlePaddle must compile with NPU.", + "phi::DenseTensor[%s] use npu place. 
PaddlePaddle must compile with " + "NPU.", var_name)); #endif return; @@ -473,8 +480,8 @@ using NpuOpRunner = paddle::operators::NpuOpRunner; constexpr int FLOAT_STATUS_SIZE = 8; -static framework::Tensor& npu_float_status() { - static framework::Tensor float_status; +static phi::DenseTensor& npu_float_status() { + static phi::DenseTensor float_status; return float_status; } @@ -494,7 +501,7 @@ void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op, flag.mutable_data({FLOAT_STATUS_SIZE}, place); NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream); - framework::Tensor tmp; + phi::DenseTensor tmp; tmp.mutable_data({FLOAT_STATUS_SIZE}, place); NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream); } @@ -503,7 +510,7 @@ void PrintNpuVarInfo(const std::string& op_type, const std::string& var_name, const framework::Variable* var, const platform::Place& place) { - const Tensor* tensor{nullptr}; + const phi::DenseTensor* tensor{nullptr}; if (var->IsType()) { tensor = &var->Get(); } else if (var->IsType()) { @@ -528,7 +535,7 @@ void PrintNpuVarInfo(const std::string& op_type, VLOG(10) << "begin check " << op_type << " var_name:" << var_name << ", place:" << tensor->place() << ", numel:" << tensor->numel(); - framework::Tensor cpu_tensor; + phi::DenseTensor cpu_tensor; cpu_tensor.Resize(tensor->dims()); cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype()); framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); @@ -575,13 +582,13 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op, auto stream = dev_ctx->stream(); auto& flag = npu_float_status(); - Tensor tmp; + phi::DenseTensor tmp; tmp.mutable_data({FLOAT_STATUS_SIZE}, place); // NPUGetFloatStatus updates data on input in-place. // tmp is only placeholder. 
NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream); - framework::Tensor cpu_tensor; + phi::DenseTensor cpu_tensor; auto cpu_place = platform::CPUPlace(); float* cpu_data = static_cast( cpu_tensor.mutable_data({FLOAT_STATUS_SIZE}, cpu_place)); diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 4aa24f8cb6ab8..57552a16cc5f4 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -230,7 +230,7 @@ void TensorCheckerVisitor::apply( template <> void tensor_check(const std::string& op_type, const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const platform::Place& place) { std::call_once(init_multi_gpu_op_var_map_flag, InitMultiGPUOpVarMap); diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index 99186c43e129e..2a25bc7b68f36 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -28,7 +28,7 @@ template struct TensorCheckerVisitor { TensorCheckerVisitor(const std::string& op_type, const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const platform::Place& place) : op_type_(op_type), var_name_(var_name), @@ -51,14 +51,14 @@ struct TensorCheckerVisitor { std::string op_type_; std::string var_name_; - const framework::Tensor& tensor_; + const phi::DenseTensor& tensor_; const platform::Place& place_; }; template void tensor_check(const std::string& op_type, const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const platform::Place& place); } // namespace details diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index de53a5de99b96..a1715062d4cb5 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -165,7 +165,7 @@ struct GatherLocalSelectedRowsFunctor { private: const std::map &dev_ctxes_; std::vector in_places_; - std::vector in_tensors_; + std::vector in_tensors_; platform::Place out_place_; phi::SelectedRows *dst_selected_rows_; diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 7f38629f4e606..d35f9360637e6 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -228,7 +228,7 @@ struct TestReduceOpHandle { out_select_rows.rows()[k])); } - f::Tensor result_tensor; + phi::DenseTensor result_tensor; f::TensorCopySync(rt, cpu_place, &result_tensor); float *ct = result_tensor.data(); @@ -279,7 +279,7 @@ struct TestReduceOpHandle { auto &rt = out_var->Get(); - f::Tensor result_tensor; + phi::DenseTensor result_tensor; f::TensorCopySync(rt, cpu_place, &result_tensor); float *ct = result_tensor.data(); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index b453e7c4a813e..caffeba538dae 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -43,13 +43,13 @@ ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} struct ScaleLossGradFunctor { float coeff_; - Tensor *out_; + phi::DenseTensor *out_; platform::Place 
place_; proto::VarType::Type out_dtype_; platform::DeviceContext *ctx_; ScaleLossGradFunctor(float coeff, - Tensor *out, + phi::DenseTensor *out, platform::Place place, proto::VarType::Type dtype, platform::DeviceContext *ctx) diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc index 9a92ae19f9425..7f9f7a537313f 100644 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc @@ -31,7 +31,7 @@ namespace details { static constexpr double kMB = 1 / (1024 * 1024); static void GetTensors(Variable *var, - std::unordered_set *tensor_set) { + std::unordered_set *tensor_set) { if (var->IsType() && var->Get().IsInitialized()) { tensor_set->insert(var->GetMutable()); } else if (var->IsType() && @@ -47,7 +47,8 @@ static void GetTensors(Variable *var, } } -static void GetTensors(Scope *scope, std::unordered_set *tensor_set) { +static void GetTensors(Scope *scope, + std::unordered_set *tensor_set) { for (auto &var_name : scope->LocalVarNames()) { GetTensors(scope->FindVar(var_name), tensor_set); } @@ -58,7 +59,7 @@ static void GetTensors(Scope *scope, std::unordered_set *tensor_set) { } static size_t GetTensorMemorySize(Scope *scope, bool clear_cpu_tensor) { - std::unordered_set tensor_set; + std::unordered_set tensor_set; GetTensors(scope, &tensor_set); size_t memory_size = 0; std::unordered_set allocation_set; diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h index 0ce66b9a0c7e7..d92bc0f0b0b1b 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h @@ -40,7 +40,7 @@ namespace framework { namespace details { // TODO(zjl): support SelectedRows -static inline const Tensor &GetTensorFromVar(const Variable *var) { +static inline const phi::DenseTensor &GetTensorFromVar(const Variable *var) { if (var->IsType()) { return var->Get(); } else { @@ -50,7 +50,7 @@ static inline const Tensor &GetTensorFromVar(const Variable *var) { } } -static inline Tensor *GetMutableTensorFromVar(Variable *var) { +static inline phi::DenseTensor *GetMutableTensorFromVar(Variable *var) { if (var->IsType()) { return var->GetMutable(); } else { diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 670c0b054c4cb..a495e405014ff 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -56,7 +56,7 @@ static void VisitVariable(const Variable& var, Func* func) { } struct TensorVisitor { - Tensor* result_{nullptr}; + phi::DenseTensor* result_{nullptr}; void operator()(LoDTensor* tensor) { result_ = tensor; } @@ -71,7 +71,7 @@ struct TensorVisitor { } }; -Tensor& VariableVisitor::GetMutableTensor(Variable* var) { +phi::DenseTensor& VariableVisitor::GetMutableTensor(Variable* var) { TensorVisitor vistor; VisitVariable(var, &vistor); return *vistor.result_; diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h index a689c47a1611f..12a18c73f5d4d 100644 --- a/paddle/fluid/framework/details/variable_visitor.h +++ b/paddle/fluid/framework/details/variable_visitor.h @@ -29,7 +29,7 @@ namespace details { class VariableVisitor { public: - static Tensor &GetMutableTensor(Variable *var); + static phi::DenseTensor &GetMutableTensor(Variable *var); 
static void ShareDimsAndLoD(const Variable &src, Variable *trg); diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index 34aa34a058e92..e3115b58f1fb1 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -32,7 +32,7 @@ void DeviceWorker::SetDataFeed(DataFeed* data_feed) { } template -std::string PrintLodTensorType(Tensor* tensor, +std::string PrintLodTensorType(phi::DenseTensor* tensor, int64_t start, int64_t end, char separator = ',', @@ -55,10 +55,10 @@ std::string PrintLodTensorType(Tensor* tensor, return os.str(); } template -void PrintLodTensorType(Tensor* tensor, +void PrintLodTensorType(phi::DenseTensor* tensor, int64_t start, int64_t end, - std::string& out_val, + std::string& out_val, // NOLINT char separator = ',', bool need_leading_separator = true) { auto count = tensor->numel(); @@ -84,10 +84,10 @@ void PrintLodTensorType(Tensor* tensor, #define FLOAT_EPS 1e-8 #define MAX_FLOAT_BUFF_SIZE 40 template <> -void PrintLodTensorType(Tensor* tensor, +void PrintLodTensorType(phi::DenseTensor* tensor, int64_t start, int64_t end, - std::string& out_val, + std::string& out_val, // NOLINT char separator, bool need_leading_separator) { char buf[MAX_FLOAT_BUFF_SIZE]; @@ -101,15 +101,15 @@ void PrintLodTensorType(Tensor* tensor, for (int64_t i = start; i < end; i++) { if (i != start || need_leading_separator) out_val += separator; if (tensor->data()[i] > -FLOAT_EPS && - tensor->data()[i] < FLOAT_EPS) + tensor->data()[i] < FLOAT_EPS) { out_val += "0"; - else { - sprintf(buf, "%.9f", tensor->data()[i]); + } else { + sprintf(buf, "%.9f", tensor->data()[i]); // NOLINT out_val += buf; } } } -std::string PrintLodTensorIntType(Tensor* tensor, +std::string PrintLodTensorIntType(phi::DenseTensor* tensor, int64_t start, int64_t end, char separator = ',', @@ -132,10 +132,10 @@ std::string PrintLodTensorIntType(Tensor* tensor, return os.str(); } -void PrintLodTensorIntType(Tensor* tensor, +void PrintLodTensorIntType(phi::DenseTensor* tensor, int64_t start, int64_t end, - std::string& out_val, + std::string& out_val, // NOLINT char separator = ',', bool need_leading_separator = true) { auto count = tensor->numel(); @@ -160,7 +160,7 @@ void PrintLodTensorIntType(Tensor* tensor, // return os.str(); } -std::string PrintLodTensor(Tensor* tensor, +std::string PrintLodTensor(phi::DenseTensor* tensor, int64_t start, int64_t end, char separator, @@ -183,10 +183,10 @@ std::string PrintLodTensor(Tensor* tensor, return out_val; } -void PrintLodTensor(Tensor* tensor, +void PrintLodTensor(phi::DenseTensor* tensor, int64_t start, int64_t end, - std::string& out_val, + std::string& out_val, // NOLINT char separator, bool need_leading_separator) { if (framework::TransToProtoVarType(tensor->dtype()) == proto::VarType::FP32) { @@ -361,7 +361,7 @@ void DeviceWorker::DumpField(const Scope& scope, continue; } size_t acutal_thread_num = - std::min((size_t)batch_size, tensor_iterator_thread_num); + std::min(static_cast(batch_size), tensor_iterator_thread_num); for (size_t i = 0; i < acutal_thread_num; i++) { size_t average_size = batch_size / acutal_thread_num; size_t begin = @@ -378,7 +378,7 @@ void DeviceWorker::DumpField(const Scope& scope, VLOG(1) << "writing a batch takes " << tt.count() << " us"; size_t acutal_thread_num = - std::min((size_t)batch_size, tensor_iterator_thread_num); + std::min(static_cast(batch_size), tensor_iterator_thread_num); for (size_t i = 0; i < acutal_thread_num; i++) { size_t average_size = batch_size / 
acutal_thread_num; size_t begin = diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 6b3766e580fae..6276d0c5003da 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -31,7 +31,6 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/wrapper/fleet.h" #endif -#include #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/heter_util.h" @@ -60,15 +59,15 @@ class Scope; namespace paddle { namespace framework { -std::string PrintLodTensor(Tensor* tensor, +std::string PrintLodTensor(phi::DenseTensor* tensor, int64_t start, int64_t end, char separator = ',', bool need_leading_separator = false); -void PrintLodTensor(Tensor* tensor, +void PrintLodTensor(phi::DenseTensor* tensor, int64_t start, int64_t end, - std::string& output_str, + std::string& output_str, // NOLINT char separator = ',', bool need_leading_separator = false); std::pair GetTensorBound(LoDTensor* tensor, int index); diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index b7bca733b8f9e..32c6e17143fa2 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -134,7 +134,7 @@ struct DLDeviceVisitor }; } // namespace internal -DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { +DLPackTensor::DLPackTensor(const phi::DenseTensor &tensor, LaneType lanes) { // init data, data buffer t_.data = const_cast(tensor.data()); diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index ff4cf23da6e96..c6fca6707fad2 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -28,7 +28,7 @@ class DLPackTensor { std::remove_reference::type; // int64_t // lanes is only used in CPU to enable vectorization - explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1); + explicit DLPackTensor(const phi::DenseTensor& tensor, LaneType lanes = 1); inline operator const ::DLTensor&() const { return t_; } diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 9e3604e71a245..0ccc5bb4ad1a4 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -47,7 +47,7 @@ constexpr uint8_t GetDLDataTypeCode() { template void TestMain(const platform::Place &place, uint16_t lanes) { DDim dims{4, 5, 6, 7}; - Tensor tensor; + phi::DenseTensor tensor; tensor.Resize(dims); void *p = tensor.mutable_data(place); @@ -85,7 +85,7 @@ void TestMain(const platform::Place &place, uint16_t lanes) { template void TestToDLManagedTensor(const platform::Place &place, uint16_t lanes) { DDim dims{6, 7}; - Tensor tensor; + phi::DenseTensor tensor; tensor.Resize(dims); tensor.mutable_data(place); diff --git a/paddle/fluid/framework/downpour_lite_worker.cc b/paddle/fluid/framework/downpour_lite_worker.cc index bd2c404a6fd2a..cb082f6385653 100644 --- a/paddle/fluid/framework/downpour_lite_worker.cc +++ b/paddle/fluid/framework/downpour_lite_worker.cc @@ -314,11 +314,11 @@ void DownpourLiteWorker::TrainFilesWithProfiler() { PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false, platform::errors::InvalidArgument( - "Tensor %s contains Inf.", var_name)); + "phi::DenseTensor %s contains Inf.", var_name)); PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false, platform::errors::InvalidArgument( - 
"Tensor %s contains NAN.", var_name)); + "phi::DenseTensor %s contains NAN.", var_name)); } #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE) @@ -487,7 +487,7 @@ void DownpourLiteWorker::TrainFiles() { if (var == nullptr) { continue; } - Tensor* tensor = nullptr; + phi::DenseTensor* tensor = nullptr; int64_t len = 0; if (var->IsType()) { tensor = var->GetMutable(); @@ -534,11 +534,11 @@ void DownpourLiteWorker::TrainFiles() { PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false, platform::errors::InvalidArgument( - "Tensor %s contains Inf.", var_name)); + "phi::DenseTensor %s contains Inf.", var_name)); PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false, platform::errors::InvalidArgument( - "Tensor %s contains NAN.", var_name)); + "phi::DenseTensor %s contains NAN.", var_name)); } #if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE) diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 0bd577d2aa6c0..7ae37052be1f5 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -584,11 +584,11 @@ void DownpourWorker::TrainFilesWithProfiler() { PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false, platform::errors::InvalidArgument( - "Tensor %s contains Inf.", var_name)); + "phi::DenseTensor %s contains Inf.", var_name)); PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false, platform::errors::InvalidArgument( - "Tensor %s contains NAN.", var_name)); + "phi::DenseTensor %s contains NAN.", var_name)); } if (need_to_push_sparse_) { @@ -872,7 +872,7 @@ void DownpourWorker::TrainFiles() { if (var == nullptr) { continue; } - Tensor* tensor = nullptr; + phi::DenseTensor* tensor = nullptr; int64_t len = 0; if (var->IsType()) { tensor = var->GetMutable(); @@ -919,11 +919,11 @@ void DownpourWorker::TrainFiles() { PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false, platform::errors::InvalidArgument( - "Tensor %s contains Inf.", var_name)); + "phi::DenseTensor %s contains Inf.", var_name)); PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false, platform::errors::InvalidArgument( - "Tensor %s contains NAN.", var_name)); + "phi::DenseTensor %s contains NAN.", var_name)); } if (need_to_push_sparse_) { diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index 22d3ac4333fb6..bbf34c03130c1 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -57,19 +57,19 @@ struct EigenTensor { using ConstType = Eigen::TensorMap>; - static Type From(Tensor& tensor, DDim dims) { // NOLINT + static Type From(phi::DenseTensor& tensor, DDim dims) { // NOLINT return Type(tensor.data(), EigenDim::From(dims)); } - static Type From(Tensor& tensor) { // NOLINT + static Type From(phi::DenseTensor& tensor) { // NOLINT return From(tensor, tensor.dims()); } // NOLINT - static ConstType From(const Tensor& tensor, DDim dims) { + static ConstType From(const phi::DenseTensor& tensor, DDim dims) { return ConstType(tensor.data(), EigenDim::From(dims)); } - static ConstType From(const Tensor& tensor) { + static ConstType From(const phi::DenseTensor& tensor) { return From(tensor, tensor.dims()); } }; @@ -78,7 +78,7 @@ template struct EigenMatrix : public EigenTensor { - static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT + static typename EigenMatrix::Type Reshape(phi::DenseTensor& tensor, // NOLINT int num_col_dims) { int rank = tensor.dims().size(); PADDLE_ENFORCE_EQ((num_col_dims > 0 && 
num_col_dims < rank), @@ -92,7 +92,7 @@ struct EigenMatrix : public EigenTensor { phi::flatten_to_2d(tensor.dims(), num_col_dims)); } - static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, + static typename EigenMatrix::ConstType Reshape(const phi::DenseTensor& tensor, int num_col_dims) { int rank = tensor.dims().size(); PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), @@ -111,13 +111,14 @@ template struct EigenVector : public EigenTensor { - // Flatten reshapes a Tensor into an EigenVector. - static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT + // Flatten reshapes a phi::DenseTensor into an EigenVector. + static typename EigenVector::Type Flatten( + phi::DenseTensor& tensor) { // NOLINT return EigenVector::From(tensor, {product(tensor.dims())}); } static typename EigenVector::ConstType Flatten( - const Tensor& tensor) { // NOLINT + const phi::DenseTensor& tensor) { // NOLINT return EigenVector::From(tensor, {product(tensor.dims())}); } }; @@ -132,14 +133,16 @@ struct EigenScalar { using ConstType = Eigen::TensorMap< Eigen::TensorFixedSize, MajorType, IndexType>>; - static Type From(Tensor& tensor) { return Type(tensor.data()); } // NOLINT + static Type From(phi::DenseTensor& tensor) { // NOLINT + return Type(tensor.data()); + } - static ConstType From(const Tensor& tensor) { + static ConstType From(const phi::DenseTensor& tensor) { return ConstType(tensor.data()); } }; -// Define Tensor with 32-bit index. +// Define phi::DenseTensor with 32-bit index. template using Tensor32BitIndex = Eigen::TensorMap, Eigen::Aligned>; diff --git a/paddle/fluid/framework/eigen_test.cc b/paddle/fluid/framework/eigen_test.cc index 4e214bd36f33a..1ce55c8a8de2b 100644 --- a/paddle/fluid/framework/eigen_test.cc +++ b/paddle/fluid/framework/eigen_test.cc @@ -28,8 +28,8 @@ TEST(EigenDim, From) { ASSERT_EQ(3, ed[2]); } -TEST(Eigen, Tensor) { - Tensor t; +TEST(Eigen, DenseTensor) { + phi::DenseTensor t; float* p = t.mutable_data(phi::make_ddim({1, 2, 3}), platform::CPUPlace()); for (int i = 0; i < 1 * 2 * 3; i++) { @@ -52,7 +52,7 @@ TEST(Eigen, Tensor) { } TEST(Eigen, ScalarFrom) { - Tensor t; + phi::DenseTensor t; int* p = t.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); *p = static_cast(100); @@ -63,7 +63,7 @@ TEST(Eigen, ScalarFrom) { } TEST(Eigen, VectorFrom) { - Tensor t; + phi::DenseTensor t; float* p = t.mutable_data(phi::make_ddim({6}), platform::CPUPlace()); for (int i = 0; i < 6; i++) { p[i] = static_cast(i); @@ -79,7 +79,7 @@ TEST(Eigen, VectorFrom) { } TEST(Eigen, VectorFlatten) { - Tensor t; + phi::DenseTensor t; float* p = t.mutable_data(phi::make_ddim({1, 2, 3}), platform::CPUPlace()); for (int i = 0; i < 1 * 2 * 3; i++) { @@ -96,7 +96,7 @@ TEST(Eigen, VectorFlatten) { } TEST(Eigen, Matrix) { - Tensor t; + phi::DenseTensor t; float* p = t.mutable_data(phi::make_ddim({2, 3}), platform::CPUPlace()); for (int i = 0; i < 2 * 3; i++) { @@ -116,7 +116,7 @@ TEST(Eigen, Matrix) { } TEST(Eigen, MatrixReshape) { - Tensor t; + phi::DenseTensor t; float* p = t.mutable_data({2, 3, 6, 4}, platform::CPUPlace()); for (int i = 0; i < 2 * 3 * 6 * 4; ++i) { p[i] = static_cast(i); diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index 2eb9dad870e67..372f0e7d38be0 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -132,7 +132,7 @@ class AscendInstance { "Not support %s as tensor type.", DataTypeToString(type))); } } - ge::Tensor 
ConvertToGeTensor(const Tensor *tensor) { + ge::Tensor ConvertToGeTensor(const phi::DenseTensor *tensor) { auto numel = tensor->numel(); std::vector vec_dim; auto dimen = arity(tensor->dims()); @@ -164,10 +164,10 @@ class AscendInstance { } void RunAscendSubgraph(int graph_idx, - const std::vector &inputs, - std::vector *outputs) { + const std::vector &inputs, + std::vector *outputs) { VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run."; - // Convert paddle Tensor to GE Tensor + // Convert paddle phi::DenseTensor to GE phi::DenseTensor std::vector ge_inputs; for (const auto &e : inputs) { ge_inputs.push_back(ConvertToGeTensor(e)); @@ -187,7 +187,8 @@ class AscendInstance { for (size_t i = 0; i < ge_outputs.size(); ++i) { const uint8_t *ret_data = ge_outputs[i].GetData(); size_t size = ge_outputs[i].GetSize(); - VLOG(1) << "GE Tensor size of the " << i << "th output var is " << size; + VLOG(1) << "GE phi::DenseTensor size of the " << i << "th output var is " + << size; auto *dst = (*outputs)[i]->mutable_data({(int64_t)size}, platform::CPUPlace()); memcpy(dst, ret_data, size); diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 2a56dc60335d9..fbfca2f983e7d 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -124,7 +124,7 @@ message VarType { FP16 = 4; FP32 = 5; FP64 = 6; - // Tensor is used in C++. + // phi::DenseTensor is used in C++. SIZE_T = 19; UINT8 = 20; INT8 = 21; diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 22d9eb43c59eb..93906e6c53e71 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -147,7 +147,7 @@ int64_t CompatMetaTensor::numel() const { ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET_CONST(Variable*, var_); - return var->Get().numel(); + return var->Get().numel(); } else { auto* var = PADDLE_GET_CONST(VarDesc*, var_); return var->ElementSize(); diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 3b7744515b2ec..d85c550dc0d6f 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -81,15 +81,17 @@ class CompatMetaTensor : public phi::MetaTensor { } const phi::SelectedRows& GetSelectedRows() const { - PADDLE_ENFORCE_EQ(is_runtime_, - true, - platform::errors::Unavailable( - "Only can get Tensor from MetaTensor in rumtime.")); + PADDLE_ENFORCE_EQ( + is_runtime_, + true, + platform::errors::Unavailable( + "Only can get phi::DenseTensor from MetaTensor in rumtime.")); auto* var = PADDLE_GET_CONST(Variable*, var_); - PADDLE_ENFORCE_EQ(var->IsType(), - true, - platform::errors::Unavailable( - "The Tensor in MetaTensor is not SelectedRows.")); + PADDLE_ENFORCE_EQ( + var->IsType(), + true, + platform::errors::Unavailable( + "The phi::DenseTensor in MetaTensor is not SelectedRows.")); return var->Get(); } diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 9bfc031f42e94..1e84646378106 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -47,7 +47,7 @@ AttentionLSTMFusePass::AttentionLSTMFusePass() { .IsTensor() .IsOptional() .End() - .AddInput("ShapeTensorList") // vector> + .AddInput("ShapeTensorList") // vector> .IsOptional() .End() .AddOutput("Out") @@ -262,11 +262,12 @@ void 
PrepareParameters(Graph* graph, const Param& param, ir::Node* lstm_op) { // reshape attention_bias auto* attention_bias_t = scope.FindVar(param.AttentionBias)->GetMutable(); - PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), - 1, - platform::errors::InvalidArgument( - "Tensor attention bias dimension size(%d) must be 1.", - attention_bias_t->dims().size())); + PADDLE_ENFORCE_EQ( + attention_bias_t->dims().size(), + 1, + platform::errors::InvalidArgument( + "phi::DenseTensor attention bias dimension size(%d) must be 1.", + attention_bias_t->dims().size())); attention_bias_t->Resize(phi::make_ddim({1, attention_bias_t->dims()[0]})); auto* attention_scalar_bias_t = @@ -339,11 +340,12 @@ void PrepareLSTMBias(const LoDTensor& B_forget, B_output.data(), B_cell.data()}; - PADDLE_ENFORCE_EQ(B_forget.dims().size(), - 1, - platform::errors::InvalidArgument( - "Tensor B forget dimension size(%d) must be 1.", - B_forget.dims().size())); + PADDLE_ENFORCE_EQ( + B_forget.dims().size(), + 1, + platform::errors::InvalidArgument( + "phi::DenseTensor B forget dimension size(%d) must be 1.", + B_forget.dims().size())); int D = B_forget.dims()[0]; out->Resize(phi::make_ddim({1, 4 * D})); auto* out_data = out->mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 3c7f77708cd0b..5da676dd09487 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -38,7 +38,7 @@ class Scope; namespace { template void ConvertTensorType(paddle::framework::LoDTensor* tensor) { - paddle::framework::Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; tmp_tensor.set_type(paddle::experimental::CppTypeToDataType::Type()); tmp_tensor.Resize(tensor->dims()); auto* tmp_data = tmp_tensor.mutable_data(paddle::platform::CPUPlace()); @@ -93,13 +93,13 @@ void recompute_bias_and_weights(const Scope* scope, Eigen::Array>; // Re-compute bias of conv2d from BN - PADDLE_ENFORCE_EQ( - eltwise_y_in_tensor->dims(), - bn_bias_tensor.dims(), - platform::errors::InvalidArgument("Tensor elementwise y(%d) and batch " - "norm bias(%d) must have same dims.", - eltwise_y_in_tensor->dims().size(), - bn_bias_tensor.dims().size())); + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), + bn_bias_tensor.dims(), + platform::errors::InvalidArgument( + "phi::DenseTensor elementwise y(%d) and batch " + "norm bias(%d) must have same dims.", + eltwise_y_in_tensor->dims().size(), + bn_bias_tensor.dims().size())); auto* scale_tensor = scope->FindVar(bn_scale.Name())->GetMutable(); auto* variance_tensor = @@ -375,7 +375,7 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { conv_bias_tensor->dims(), eltwise_y_in_tensor->dims(), platform::errors::InvalidArgument( - "Tensor convolution bias(%d) and elementwise y(%d) " + "phi::DenseTensor convolution bias(%d) and elementwise y(%d) " "must have same dims.", conv_bias_tensor->dims().size(), eltwise_y_in_tensor->dims().size())); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index a71f6ac94b415..1cdefad43030e 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -132,7 +132,7 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); - // Only support 2D-Tensor as weight for FC + // Only support 2D-phi::DenseTensor as weight for 
FC std::vector w_shape = w->Var()->GetShape(); size_t w_rank = w_shape.size(); if (w_rank != 2) return; diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc index dd399eea604fd..98279e73c1a7f 100644 --- a/paddle/fluid/framework/ir/fusion_group/operation.cc +++ b/paddle/fluid/framework/ir/fusion_group/operation.cc @@ -141,7 +141,8 @@ void OperationMap::InsertUnaryElementwiseOperations() { // scale // out = (bias_after_scale) ? scale * X + bias : scale(X + bias) // here we use '=' operator to separate th default value - // TODO(wangchaochaohu): Later we need to support Tensor input for scale and + // TODO(wangchaochaohu): Later we need to support phi::DenseTensor input for + // scale and // bias. insert_handler( "scale", diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc index 8a24e93170c25..69d304d0a7c2b 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc @@ -132,7 +132,7 @@ class ShrinkDepsOpFunctor { /** * Shrink op dependencies according to no need buffer vars. * - * If some ops do not need Tensor buffer of any input, + * If some ops do not need phi::DenseTensor buffer of any input, * just remove the dependency of this op, i.e, decrease reference count. * * For example, input Y of elementwise_add_grad op is only used to infer shape diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc index 394c1ae797e4c..df19bc9ade8d5 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -27,7 +27,7 @@ namespace framework { namespace ir { void ComputePropagateScalesMkldnnPass::GetTensorFromVector( - const std::vector& data_v, Tensor* tensor) const { + const std::vector& data_v, phi::DenseTensor* tensor) const { const int size = static_cast(data_v.size()); auto* data = tensor->mutable_data({size}, platform::CPUPlace()); for (int i = 0; i < size; i++) { @@ -41,15 +41,15 @@ void ComputePropagateScalesMkldnnPass::GetQuantInfo( GetInfoFromTheFirstOp(graph, "has_quant_info", "var_quant_scales", &info_map); for (auto iter = info_map.begin(); iter != info_map.end(); iter++) { - Tensor tensor; + phi::DenseTensor tensor; GetTensorFromVector(iter->second, &tensor); auto pair = std::make_pair(false, tensor); var_quant_scales->insert(std::make_pair(iter->first, pair)); } } -std::vector ComputePropagateScalesMkldnnPass::GetScales(Tensor* tensor, - int axis) const { +std::vector ComputePropagateScalesMkldnnPass::GetScales( + phi::DenseTensor* tensor, int axis) const { PADDLE_ENFORCE_LT(axis, 2, platform::errors::InvalidArgument( @@ -120,7 +120,7 @@ void ComputePropagateScalesMkldnnPass::ComputeVarScales( volume *= dims[i]; } - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; std::vector reshape_dims = {dims[0], volume}; tmp_tensor.Resize(phi::make_ddim(reshape_dims)); auto* weight_data = weight_tensor->data(); @@ -130,7 +130,7 @@ void ComputePropagateScalesMkldnnPass::ComputeVarScales( } auto scales_v = GetScales(&tmp_tensor, axis); - Tensor tensor; + phi::DenseTensor tensor; GetTensorFromVector(scales_v, &tensor); auto pair = std::make_pair(false, tensor); var_quant_scales->insert(std::make_pair(var_name, pair)); @@ -142,7 
+142,7 @@ void ComputePropagateScalesMkldnnPass::ComputeSingleGruWeightScales( Scope* scope, const std::string& wx_var_name, const std::string& wh_var_name, - Tensor* tensor) const { + phi::DenseTensor* tensor) const { auto* wx_var = scope->FindVar(wx_var_name); PADDLE_ENFORCE_NOT_NULL( wx_var, @@ -228,7 +228,7 @@ void ComputePropagateScalesMkldnnPass::ComputeGruWeightScales( for (int i = 0; i < wx_names_size; i++) { auto wh_var_name = wh_var_names[i]; auto wx_var_name = wx_var_names[i]; - Tensor tensor; + phi::DenseTensor tensor; ComputeSingleGruWeightScales(scope, wx_var_name, wh_var_name, &tensor); auto pair = std::make_pair(false, tensor); var_quant_scales->insert(std::make_pair(wx_var_name, pair)); @@ -241,7 +241,7 @@ void ComputePropagateScalesMkldnnPass::ComputeSingleLstmWeightScales( Scope* scope, const std::string& wx_var_name, const std::string& wh_var_name, - Tensor* tensor) const { + phi::DenseTensor* tensor) const { auto* wx_var = scope->FindVar(wx_var_name); PADDLE_ENFORCE_NOT_NULL( wx_var, @@ -307,7 +307,7 @@ void ComputePropagateScalesMkldnnPass::ComputeLstmWeightScales( for (int i = 0; i < wx_names_size; i++) { auto wh_var_name = wh_var_names[i]; auto wx_var_name = wx_var_names[i]; - Tensor tensor; + phi::DenseTensor tensor; ComputeSingleLstmWeightScales(scope, wx_var_name, wh_var_name, &tensor); auto pair = std::make_pair(false, tensor); var_quant_scales->insert(std::make_pair(wx_var_name, pair)); @@ -348,7 +348,7 @@ void ComputePropagateScalesMkldnnPass::UpdateScaleOpInScale( const auto tensor = pair.second; const auto scale = PADDLE_GET_CONST(float, op_node->Op()->GetAttr("scale")); - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; tmp_tensor.Resize(tensor.dims()); auto* data = tmp_tensor.mutable_data(platform::CPUPlace()); for (int i = 0; i < tensor.numel(); i++) { diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h index 09863fdc768b2..ecc3ad16a54e6 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h @@ -22,7 +22,8 @@ namespace paddle { namespace framework { namespace ir { -using StringPairMap = std::unordered_map>; +using StringPairMap = + std::unordered_map>; class ComputePropagateScalesMkldnnPass : public FusePassBase { public: @@ -38,11 +39,11 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase { private: void GetTensorFromVector(const std::vector& data_v, - Tensor* tensor) const; + phi::DenseTensor* tensor) const; void GetQuantInfo(ir::Graph* graph, StringPairMap* var_quant_scales) const; - std::vector GetScales(Tensor* tensor, int axis) const; + std::vector GetScales(phi::DenseTensor* tensor, int axis) const; void ComputeVarScales(ir::Graph* graph, Scope* scope, @@ -54,7 +55,7 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase { void ComputeSingleGruWeightScales(Scope* scope, const std::string& wx_var_name, const std::string& wh_var_name, - Tensor* tensor) const; + phi::DenseTensor* tensor) const; void ComputeGruWeightScales(ir::Graph* graph, Scope* scope, @@ -65,7 +66,7 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase { void ComputeSingleLstmWeightScales(Scope* scope, const std::string& wx_var_name, const std::string& wh_var_name, - Tensor* tensor) const; + phi::DenseTensor* tensor) const; void ComputeLstmWeightScales(ir::Graph* graph, Scope* scope, diff --git 
a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc index 38c6fb57d58e3..03c01507ca27d 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc @@ -59,7 +59,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { pass.reset(new ComputePropagateScalesMkldnnPass()); } - std::vector GetScales(Tensor* tensor, int axis) const { + std::vector GetScales(phi::DenseTensor* tensor, int axis) const { return pass->GetScales(tensor, axis); } @@ -164,7 +164,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { graph, &scope, wx_name, wh_name, &var_quant_scales); } bool is_unsigned; - framework::Tensor wx_result_tensor; + phi::DenseTensor wx_result_tensor; std::tie(is_unsigned, wx_result_tensor) = var_quant_scales[wx_var_names]; ASSERT_EQ(is_unsigned, false); @@ -235,7 +235,7 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, get_scales_function) { const auto& values = positive_and_negative_values; float max_val = *std::max_element(values.begin(), values.end()); - framework::Tensor var_tensor; + phi::DenseTensor var_tensor; var_tensor.Resize(phi::make_dim(values.size(), 1)); std::copy(begin(values), end(values), @@ -273,7 +273,7 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, compute_var_scales) { ComputeVarScales(graph, &scope, ops, weight_name, axis, &var_quant_scales); bool is_unsigned; - framework::Tensor result_tensor; + phi::DenseTensor result_tensor; std::tie(is_unsigned, result_tensor) = var_quant_scales[weight_var_name]; diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc index b9ffee3c00c46..814d1d5d73dcd 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc @@ -62,14 +62,14 @@ void recompute_bias_and_weights(const Scope* scope, Eigen::Array>; // Re-compute bias of conv2d from AffineChannel - PADDLE_ENFORCE_EQ( - eltwise_y_in_tensor->dims(), - ac_bias_tensor.dims(), - platform::errors::InvalidArgument( - "Tensor elementwise y(%d) and activation bias(%d) must have same " - "dimension.", - eltwise_y_in_tensor->dims().size(), - ac_bias_tensor.dims().size())); + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), + ac_bias_tensor.dims(), + platform::errors::InvalidArgument( + "phi::DenseTensor elementwise y(%d) and activation " + "bias(%d) must have same " + "dimension.", + eltwise_y_in_tensor->dims().size(), + ac_bias_tensor.dims().size())); auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 6f7bb614cc79f..78fc02329efe2 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -192,7 +192,7 @@ class MKLDNNConvBatchNormPassTest { return prog; } - void FillTensorWithRandomData(Tensor* tnsr, + void FillTensorWithRandomData(phi::DenseTensor* tnsr, float lowb, float upb, platform::CPUPlace place) { @@ -206,7 +206,7 @@ class MKLDNNConvBatchNormPassTest { } } - void CompareTensors(Tensor* tensor1, Tensor* tensor2) { + void CompareTensors(phi::DenseTensor* 
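The recompute_bias_and_weights routine touched above folds an AffineChannel (per-output-channel scale and bias) into the preceding conv. A plain-C++ sketch of that folding, assuming OIHW weights flattened row-major per output channel; the exact formula is an assumption and is not copied from the pass:

    #include <cstdint>
    #include <vector>

    // Fold per-channel scale/bias into conv weights and elementwise bias.
    void FoldAffineChannel(std::vector<float>* weights,  // [oc * k], one row per channel
                           std::vector<float>* bias,     // [oc]
                           const std::vector<float>& ac_scale,
                           const std::vector<float>& ac_bias,
                           int64_t oc, int64_t k) {
      for (int64_t c = 0; c < oc; ++c) {
        for (int64_t i = 0; i < k; ++i) (*weights)[c * k + i] *= ac_scale[c];
        (*bias)[c] = (*bias)[c] * ac_scale[c] + ac_bias[c];
      }
    }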
tensor1, phi::DenseTensor* tensor2) { // check dims for (int i = 0; i < tensor1->numel(); ++i) { EXPECT_NEAR(tensor1->data()[i], tensor2->data()[i], 1e-3); @@ -306,7 +306,7 @@ class MKLDNNConvBatchNormPassTest { // Need to copy result over as the same scope is used in both executors // so first result will be overwritten by second auto* m_tensor = exe.FindTensor("m"); - Tensor no_ir_result; + phi::DenseTensor no_ir_result; TensorCopy(*m_tensor, place, &no_ir_result); graph.reset(pass->Apply(graph.release())); diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index b674ef52183c0..abe51960183c5 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -406,7 +406,7 @@ void QuantDequantMkldnnPass::RemoveFakeOps( GraphSafeRemoveNodes(graph, nodes2rm); } -void QuantDequantMkldnnPass::TransposeWeight(Tensor* input) const { +void QuantDequantMkldnnPass::TransposeWeight(phi::DenseTensor* input) const { const auto in_dims = input->dims(); std::vector out_dim_v; std::vector axis; @@ -421,7 +421,7 @@ void QuantDequantMkldnnPass::TransposeWeight(Tensor* input) const { auto out_stride = phi::stride(out_dims); const int count = input->numel(); - Tensor trans_tensor; + phi::DenseTensor trans_tensor; trans_tensor.Resize(out_dims); float* trans_data = trans_tensor.mutable_data(platform::CPUPlace()); float* in_data = input->mutable_data(platform::CPUPlace()); @@ -465,7 +465,7 @@ bool QuantDequantMkldnnPass::IsInt8Weight( void QuantDequantMkldnnPass::ConvertFromINT8ToFP32( const std::vector& scales, - Tensor* weight_tensor, + phi::DenseTensor* weight_tensor, int8_t* int8_weight_data, float* fp32_weight_data, const std::string& weight_var_name) const { diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h index eee7fc96ed1d4..deb9072e04a49 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h @@ -66,7 +66,7 @@ class QuantDequantMkldnnPass : public FusePassBase { const; void ConvertFromINT8ToFP32(const std::vector& scales, - Tensor* weight_tensor, + phi::DenseTensor* weight_tensor, int8_t* int8_weight_data, float* fp32_weight_data, const std::string& weight_var_name) const; @@ -106,7 +106,7 @@ class QuantDequantMkldnnPass : public FusePassBase { Scope* scope, const std::string& weight_name) const; - void TransposeWeight(Tensor* input) const; + void TransposeWeight(phi::DenseTensor* input) const; void DequantizeOpWeights( Node* op_node, diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 089c252ea6947..ed1d0653df715 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -639,12 +639,12 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { namespace { template -inline void QKVWeightsProcess(Tensor* wq_tensor, - Tensor* wk_tensor, - Tensor* wv_tensor, - Tensor* bq_tensor, - Tensor* bk_tensor, - Tensor* bv_tensor) { +inline void QKVWeightsProcess(phi::DenseTensor* wq_tensor, + phi::DenseTensor* wk_tensor, + phi::DenseTensor* wv_tensor, + phi::DenseTensor* bq_tensor, + phi::DenseTensor* bk_tensor, + phi::DenseTensor* bv_tensor) { auto* wq_data = wq_tensor->mutable_data(platform::CPUPlace()); auto* wk_data = 
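TransposeWeight above swaps the two axes of an fp32 weight by walking input and output strides. The same copy reduced to plain C++ over a row-major buffer (illustrative helper, not the pass code):

    #include <cstdint>
    #include <vector>

    // Transpose a row-major {rows, cols} buffer: in_stride = {cols, 1},
    // out_stride for the transposed layout = {rows, 1}.
    std::vector<float> Transpose2D(const std::vector<float>& in,
                                   int64_t rows, int64_t cols) {
      std::vector<float> out(in.size());
      for (int64_t r = 0; r < rows; ++r) {
        for (int64_t c = 0; c < cols; ++c) {
          out[c * rows + r] = in[r * cols + c];
        }
      }
      return out;
    }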
wk_tensor->mutable_data(platform::CPUPlace()); auto* wv_data = wv_tensor->mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/framework/ir/pass_test_util.h b/paddle/fluid/framework/ir/pass_test_util.h index c028108617c21..44f6c66295466 100644 --- a/paddle/fluid/framework/ir/pass_test_util.h +++ b/paddle/fluid/framework/ir/pass_test_util.h @@ -129,7 +129,7 @@ bool RunPassAndAssert(Graph* graph, /// @param[in] var_name The variable name. /// @param[in] dims The dimensions of allocated tensor. /// -/// @tparam T Tensor data type. +/// @tparam T phi::DenseTensor data type. /// template void InitLoDTensorHolder(const Scope& scope, diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 33d293faad129..dec38ae386159 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -22,18 +22,18 @@ limitations under the License. */ #include #include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" namespace paddle { namespace framework { using LoDTensor = phi::DenseTensor; -// Split Tensor and copy to each place specified in places. +// Split phi::DenseTensor and copy to each place specified in places. std::vector SplitLoDTensor( const LoDTensor& src, const std::vector places); diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 06962f7b5e773..a348adbedfcc7 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -194,7 +194,8 @@ bool IsTensorOfVarInitialized(Variable* var) { if (var->IsType() || var->IsType()) { return GetLoDTensorOrSelectedRowsValueFromVar(*var)->IsInitialized(); } else if (var->IsType()) { - return static_cast(&(var->Get()[0])) + return static_cast( + &(var->Get()[0])) ->IsInitialized(); } } @@ -440,7 +441,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto var = var_name_item.second[i]; auto var_name = new_ins[var_name_item.first].at(i); - const Tensor* tensor_in; + const phi::DenseTensor* tensor_in; std::string new_var_name; bool is_transferred = false; @@ -450,8 +451,8 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, if (var->Get().size() == 0) { continue; } - tensor_in = - static_cast(&(var->Get()[0])); + tensor_in = static_cast( + &(var->Get()[0])); } else { continue; } @@ -470,7 +471,8 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, (expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) && (paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout() == DataLayout::kNHWC)) { - VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , " + VLOG(7) << "Created reshaped dummy input based on MKL-DNN " + "phi::DenseTensor , " "but kNHWC layout" << var_name_item.first << " in Operator " << op_base->Type(); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index c379e135b16b6..d973942d9f975 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -355,10 +355,10 @@ void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* 
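The dispatch in ApplyDataTransform above picks the underlying dense tensor out of a Variable depending on what it holds. A hand-restored illustration follows; the angle-bracket arguments are not preserved in this rendering, so treat the exact spellings as assumptions rather than the verbatim patch (fragment from inside the input loop, usual framework headers assumed):

    const phi::DenseTensor* tensor_in = nullptr;
    if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>()) {
      tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
    } else if (var->IsType<LoDTensorArray>()) {
      if (var->Get<LoDTensorArray>().size() == 0) continue;  // nothing to transfer
      tensor_in = static_cast<const phi::DenseTensor*>(
          &(var->Get<LoDTensorArray>()[0]));
    } else {
      continue;  // other variable kinds are not data-transferred
    }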
instr_node) { } void InterpreterCore::BuildInplace() { - // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput Tensor and a list - // of Output Tensors which are sliced from the FusedOutput. These outputs - // sholud not be the outvar of the in-place var-pair since memory reuse - // between FusedOutput and Output Tensors is assumed. For the following + // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput phi::DenseTensor + // and a list of Output Tensors which are sliced from the FusedOutput. These + // outputs sholud not be the outvar of the in-place var-pair since memory + // reuse between FusedOutput and Output Tensors is assumed. For the following // example: // fused_var, var1, var2, var3 = coalesce_tensor(var1, var2, var3) // var1 = sum(var4, var5) @@ -444,9 +444,9 @@ void InterpreterCore::BuildOperatorDependences() { } } -// At the end of each step, the holder of Tensor in LoDTensorArray is null. -// Clear these Tensors and leave LoDTensorArray empty, otherwise an exception -// will occur in the next step +// At the end of each step, the holder of phi::DenseTensor in LoDTensorArray is +// null. Clear these Tensors and leave LoDTensorArray empty, otherwise an +// exception will occur in the next step void InterpreterCore::ClearLoDTensorArrayInLocalScope() { auto vars = local_scope_->LocalVars(); for (auto var : vars) { @@ -994,7 +994,7 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { gpuStream_t stream = reinterpret_cast(instr.DeviceContext()).stream(); - auto TensorRecordStream = [&stream](Tensor& tensor) { + auto TensorRecordStream = [&stream](phi::DenseTensor& tensor) { auto allocation = tensor.Holder(); if (allocation == nullptr) { return; diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index c40a80ce0752c..0d4fdaab41b6b 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -299,7 +299,7 @@ void InterpretercoreInferShapeContext::ShareLoD(const std::string& in, // TODO(dzhwinter) : reuse ShareLoD in most operators. // Need to call ShareLayout explicitly in sequence related ops. -// Shall we have a better method to shared info between in/out Tensor? +// Shall we have a better method to shared info between in/out phi::DenseTensor? 
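The NOTE above about coalesce_tensor_op is an aliasing argument: the sliced outputs are views into the FusedOutput buffer, so reusing one of them as an in-place output would silently overwrite the fused storage. A small self-contained illustration of that aliasing with plain vectors (not Paddle code):

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<float> fused(6, 0.f);   // stands in for FusedOutput
      float* var1 = fused.data();         // view over elements [0, 3)
      float* var2 = fused.data() + 3;     // view over elements [3, 6)
      var1[0] = 42.f;                     // writing the view writes the fused buffer
      assert(fused[0] == 42.f);           // hence var1 must not be reused in place
      (void)var2;
      return 0;
    }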
#ifdef PADDLE_WITH_MKLDNN // Fix me: ugly workaround below // Correct solution: diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 4c28a9b59535e..ac0af60ec722e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -528,9 +528,10 @@ void OperatorBase::GenerateTemporaryNames() { } } -const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { +const phi::DenseTensor* GetLoDTensorOrSelectedRowsValueFromVar( + const Variable& var) { if (var.IsType()) { - return static_cast(&(var.Get())); + return static_cast(&(var.Get())); } else if (var.IsType()) { return &(var.Get().value()); } else { @@ -540,7 +541,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { } } -Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { +phi::DenseTensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { if (var->IsType()) { return var->GetMutable(); } else if (var->IsType()) { @@ -607,20 +608,20 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { } template <> -const std::vector ExecutionContext::MultiInput( - const std::string& name) const { +const std::vector +ExecutionContext::MultiInput(const std::string& name) const { LogVarUsageIfUnusedVarCheckEnabled(name); auto vars = MultiInputVar(name); if (vars.size() == 0) { return {}; } - std::vector res; + std::vector res; res.reserve(vars.size()); std::transform(vars.begin(), vars.end(), std::back_inserter(res), - [&](const Variable* var) -> const Tensor* { + [&](const Variable* var) -> const phi::DenseTensor* { if (var == nullptr) return nullptr; PADDLE_ENFORCE_EQ(var->IsType(), true, @@ -634,19 +635,19 @@ const std::vector ExecutionContext::MultiInput( } template <> -std::vector ExecutionContext::MultiOutput( +std::vector ExecutionContext::MultiOutput( const std::string& name) const { auto vars = MultiOutputVar(name); if (vars.size() == 0) { return {}; } - std::vector res; + std::vector res; res.reserve(vars.size()); std::transform(vars.begin(), vars.end(), std::back_inserter(res), - [&](Variable* var) -> Tensor* { + [&](Variable* var) -> phi::DenseTensor* { return var == nullptr ? nullptr : var->GetMutable(); }); @@ -958,7 +959,7 @@ class RuntimeInferShapeContext : public InferShapeContext { // TODO(dzhwinter) : reuse ShareLoD in most operators. // Need to call ShareLayout explicitly in sequence related ops. -// Shall we have a better method to shared info between in/out Tensor? +// Shall we have a better method to shared info between in/out phi::DenseTensor? 
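The MultiInput/MultiOutput specializations above follow one pattern: map every variable slot to its dense tensor and keep nullptr for absent optional slots. A minimal sketch of that shape in plain C++ ("Var" is a stand-in type, not the framework class):

    #include <algorithm>
    #include <iterator>
    #include <vector>

    struct Var { float value = 0.f; };

    std::vector<const float*> GatherTensors(const std::vector<const Var*>& vars) {
      std::vector<const float*> res;
      res.reserve(vars.size());
      std::transform(vars.begin(), vars.end(), std::back_inserter(res),
                     [](const Var* v) -> const float* {
                       return v == nullptr ? nullptr : &v->value;
                     });
      return res;
    }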
#ifdef PADDLE_WITH_MKLDNN // Fix me: ugly workaround below // Correct solution: @@ -1210,7 +1211,7 @@ struct OperatorWithKernel::CacheImpl { static void CheckTensorNANOrInf(const std::string& op_type, const std::string& name, - const framework::Tensor& tensor) { + const phi::DenseTensor& tensor) { if (tensor.memory_size() == 0) { return; } @@ -1218,16 +1219,18 @@ static void CheckTensorNANOrInf(const std::string& op_type, framework::TransToProtoVarType(tensor.dtype()) != proto::VarType::FP64) { return; } - PADDLE_ENFORCE_NE( - framework::TensorContainsInf(tensor), - true, - platform::errors::Fatal( - "Operator %s output Tensor %s contains Inf.", op_type, name)); - PADDLE_ENFORCE_NE( - framework::TensorContainsNAN(tensor), - true, - platform::errors::Fatal( - "Operator %s output Tensor %s contains NAN.", op_type, name)); + PADDLE_ENFORCE_NE(framework::TensorContainsInf(tensor), + true, + platform::errors::Fatal( + "Operator %s output phi::DenseTensor %s contains Inf.", + op_type, + name)); + PADDLE_ENFORCE_NE(framework::TensorContainsNAN(tensor), + true, + platform::errors::Fatal( + "Operator %s output phi::DenseTensor %s contains NAN.", + op_type, + name)); } bool OperatorWithKernel::SupportGPU() const { @@ -2112,7 +2115,7 @@ void OperatorWithKernel::HandleComplexGradToRealGrad( << " var `" << var_name << "` to " << framework::DataTypeToString(dst_type) << " real var in static graph."; - Tensor out; + phi::DenseTensor out; TransComplexToReal(dst_type, src_type, *grad_tensor, &out); SetTensorToVariable(*grad_var, out, grad_var); } @@ -2153,7 +2156,7 @@ Scope* OperatorWithKernel::PrepareData( auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); - // When no_buffer_ins then checking of Tensor::holder_ is + // When no_buffer_ins then checking of phi::DenseTensor::holder_ is // not a thread safe. And for infershape scenario checks // to be omitted are not really needed if (should_skip_input == true) { @@ -2180,7 +2183,8 @@ Scope* OperatorWithKernel::PrepareData( out->Resize(tensor_in->dims()); platform::MatchShapeToLayout( out, tensor_in->layout(), DataLayout::kNHWC); - VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , " + VLOG(7) << "Created reshaped dummy input based on MKL-DNN " + "phi::DenseTensor , " "but kNHWC layout" << in_name << " in Operator " << type_; } else { @@ -2308,7 +2312,7 @@ Scope* OperatorWithKernel::PrepareData( } // Do transfer - Tensor out; + phi::DenseTensor out; TransformData(new_expected_kernel_key ? 
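CheckTensorNANOrInf above only scans floating-point outputs and aborts with the op and variable names when an Inf or NaN is found. The same logic as a self-contained plain-C++ helper (illustrative, not the framework's TensorContainsInf/TensorContainsNAN kernels):

    #include <cmath>
    #include <cstdint>
    #include <stdexcept>
    #include <string>

    void CheckBufferNanOrInf(const std::string& op_type, const std::string& name,
                             const double* data, int64_t numel) {
      for (int64_t i = 0; i < numel; ++i) {
        if (std::isinf(data[i]))
          throw std::runtime_error("Operator " + op_type + " output " + name +
                                   " contains Inf.");
        if (std::isnan(data[i]))
          throw std::runtime_error("Operator " + op_type + " output " + name +
                                   " contains NaN.");
      }
    }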
*new_expected_kernel_key : expected_kernel_key, kernel_type_for_var, @@ -2375,9 +2379,9 @@ void OperatorWithKernel::ParseInputDataType( const std::string& name, proto::VarType::Type* data_type) const { if (var != nullptr) { - const Tensor* t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor* t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } else if (var->IsType()) { @@ -2391,13 +2395,13 @@ void OperatorWithKernel::ParseInputDataType( } } if (t != nullptr) { - PADDLE_ENFORCE_EQ( - t->IsInitialized(), - true, - platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " - "contains uninitialized Tensor.", - Type(), - name)); + PADDLE_ENFORCE_EQ(t->IsInitialized(), + true, + platform::errors::InvalidArgument( + "The %s Op's Input Variable `%s` " + "contains uninitialized phi::DenseTensor.", + Type(), + name)); *data_type = paddle::framework::TransToProtoVarType(t->dtype()); } } @@ -2412,9 +2416,9 @@ void OperatorWithKernel::ParseMultiInputDataType( for (size_t i = 0; i < vars.size(); ++i) { const Variable* var = vars[i]; if (var != nullptr) { - const Tensor* t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor* t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } else if (var->IsType()) { @@ -2428,13 +2432,13 @@ void OperatorWithKernel::ParseMultiInputDataType( } } if (t != nullptr) { - PADDLE_ENFORCE_EQ( - t->IsInitialized(), - true, - platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " - "contains uninitialized Tensor.", - Type(), - name)); + PADDLE_ENFORCE_EQ(t->IsInitialized(), + true, + platform::errors::InvalidArgument( + "The %s Op's Input Variable `%s` " + "contains uninitialized phi::DenseTensor.", + Type(), + name)); proto::VarType::Type tmp = paddle::framework::TransToProtoVarType(t->dtype()); PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type, @@ -2496,7 +2500,7 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType( return data_type; } -Tensor* OperatorWithKernel::GetTensorFormInputSafely( +phi::DenseTensor* OperatorWithKernel::GetTensorFormInputSafely( const ExecutionContext& ctx, const std::string& name) const { // 1. get variable and check // NOTE: only supports signal input var now @@ -2509,9 +2513,9 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( platform::errors::NotFound( "The variable %s is not found when promote complex types.", name)); // 2. 
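ParseMultiInputDataType above implements a simple rule: the first initialized input fixes the promoted type, every later initialized input must agree, uninitialized slots are skipped. A compact sketch of that rule (DType and its values are stand-ins for proto::VarType::Type):

    #include <stdexcept>
    #include <vector>

    enum class DType { kUnknown, kFP32, kFP64, kINT64 };

    DType CommonInputType(const std::vector<DType>& input_types) {
      DType result = DType::kUnknown;
      for (DType t : input_types) {
        if (t == DType::kUnknown) continue;          // uninitialized slot, skip
        if (result == DType::kUnknown) {
          result = t;                                // first initialized input wins
        } else if (t != result) {
          throw std::runtime_error("inputs must have consistent data types");
        }
      }
      return result;
    }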
get tensor and check - Tensor* t = nullptr; - if (var->IsType()) { - t = var->GetMutable(); + phi::DenseTensor* t = nullptr; + if (var->IsType()) { + t = var->GetMutable(); } else if (var->IsType()) { t = var->GetMutable(); } else if (var->IsType()) { @@ -2520,18 +2524,19 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input variable type in complex type promotion.")); } - PADDLE_ENFORCE_NOT_NULL( - t, + PADDLE_ENFORCE_NOT_NULL(t, + platform::errors::InvalidArgument( + "The phi::DenseTensor of variable %s is nullptr " + "when promote complex types.")); + PADDLE_ENFORCE_EQ( + t->IsInitialized(), + true, platform::errors::InvalidArgument( - "The Tensor of variable %s is nullptr when promote complex types.")); - PADDLE_ENFORCE_EQ(t->IsInitialized(), - true, - platform::errors::InvalidArgument( - "The Tensor in the %s Op's Input Variable %s(%s) is " - "not initialized.", - Type(), - name, - ctx.InputName(name))); + "The phi::DenseTensor in the %s Op's Input Variable %s(%s) is " + "not initialized.", + Type(), + name, + ctx.InputName(name))); return t; } @@ -2567,7 +2572,7 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( OpKernelType OperatorWithKernel::GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const OpKernelType& expected_kernel_type) const { return OpKernelType( expected_kernel_type.data_type_, tensor.place(), tensor.layout()); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index edb2d539f82ef..33b1f5c32300b 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -124,8 +124,9 @@ inline bool VarIsTensor(const Variable& var) { return var.IsType() || var.IsType(); } -const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var); -Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); +const phi::DenseTensor* GetLoDTensorOrSelectedRowsValueFromVar( + const Variable& var); +phi::DenseTensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); class ExecutionContext; class OperatorBase; @@ -449,8 +450,8 @@ class ExecutionContext { #endif template - Tensor AllocateTmpTensor(const framework::DDim& dim, - const DevContext& dev_ctx) const { + phi::DenseTensor AllocateTmpTensor(const framework::DDim& dim, + const DevContext& dev_ctx) const { phi::DenseTensor tmp; tmp.Resize(dim); dev_ctx.template Alloc(&tmp); @@ -552,11 +553,11 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { }; template <> -const std::vector ExecutionContext::MultiInput( - const std::string& name) const; +const std::vector +ExecutionContext::MultiInput(const std::string& name) const; template <> -std::vector ExecutionContext::MultiOutput( +std::vector ExecutionContext::MultiOutput( const std::string& name) const; class OpKernelBase { @@ -640,7 +641,7 @@ class OperatorWithKernel : public OperatorBase { // need transform data virtual OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const OpKernelType& expected_kernel_type) const; platform::Place GetExecutionPlace( @@ -649,12 +650,13 @@ class OperatorWithKernel : public OperatorBase { } /* member functions for adapting to phi lib */ - /** In the Tensor calculation library, the new Kernel adopts a clearer and - * more streamlined design. 
The arguments of the Kernel and the input and - * output arguments registered in the original OpMaker do not match in some - * cases, so we use map to record the arguments required by the kernel. - * When selecting Kernel during Op execution, select the arguments of the - * original Op according to the GetExpectedPhiKernelArgs returned arguments. + /** In the phi::DenseTensor calculation library, the new Kernel adopts a + * clearer and more streamlined design. The arguments of the Kernel and the + * input and output arguments registered in the original OpMaker do not match + * in some cases, so we use map to record the arguments required by the + * kernel. When selecting Kernel during Op execution, select the arguments of + * the original Op according to the GetExpectedPhiKernelArgs returned + * arguments. */ phi::KernelSignature GetExpectedPhiKernelArgs( const ExecutionContext& ctx) const; @@ -723,8 +725,8 @@ class OperatorWithKernel : public OperatorBase { const std::string& name, proto::VarType::Type* data_type) const; // used for IndicateOrPromoteVarDataTypes - Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx, - const std::string& name) const; + phi::DenseTensor* GetTensorFormInputSafely(const ExecutionContext& ctx, + const std::string& name) const; protected: mutable std::unique_ptr kernel_type_; diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index b5aaa22e86ee2..20aab15651d16 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -146,7 +146,7 @@ class CPUKernelTest : public OpKernel { cpu_kernel_run_num++; ASSERT_EQ(ctx.InputName("x"), "IN1"); ASSERT_EQ(ctx.OutputName("y"), "OUT1"); - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); ASSERT_EQ(x, nullptr); } }; @@ -196,13 +196,13 @@ class CPUKernalMultiInputsTest : public OpKernel { auto outVar0 = ctx.MultiOutputVar("ys"); ASSERT_EQ(outVar0.size(), 2U); - auto inTensor0 = ctx.MultiInput("xs"); + auto inTensor0 = ctx.MultiInput("xs"); ASSERT_EQ(inTensor0.size(), 3U); - auto intTensor1 = ctx.Input("k"); + auto intTensor1 = ctx.Input("k"); ASSERT_NE(intTensor1, nullptr); - auto outTensor0 = ctx.MultiOutput("ys"); + auto outTensor0 = ctx.MultiOutput("ys"); ASSERT_EQ(outTensor0.size(), 2U); auto k = ctx.InputName("k"); @@ -349,7 +349,7 @@ class IndicateLoDTensorDataTypeTest : public OperatorWithKernel { class IndicateLoDTensorDataTypeTestProtoMaker : public OpProtoAndCheckerMaker { public: void Make() { - AddInput("LoDTensor", "Input of Tensor type Variable."); + AddInput("LoDTensor", "Input of phi::DenseTensor type Variable."); AddComment("This Op is only for IndicateVarDataType interface test."); } }; @@ -450,7 +450,8 @@ TEST(IndicateVarDataTypeTest, lodtensor) { EXPECT_TRUE( ex_msg.find( "The indicate_lod_tensor_data_type_test Op's Input Variable " - "`LoDTensor` contains uninitialized Tensor.") != std::string::npos); + "`LoDTensor` contains uninitialized phi::DenseTensor.") != + std::string::npos); } ASSERT_TRUE(caught); } @@ -477,7 +478,7 @@ TEST(IndicateVarDataTypeTest, selectedrows) { EXPECT_TRUE( ex_msg.find("The indicate_selected_rows_data_type_test Op's " "Input Variable `SelectedRows` contains uninitialized " - "Tensor.") != std::string::npos); + "phi::DenseTensor.") != std::string::npos); } ASSERT_TRUE(caught); } @@ -684,8 +685,8 @@ class OpWithoutUnusedVarKernelTest : public OpKernel { void Compute(const ExecutionContext& ctx) const { ASSERT_EQ(ctx.InputName("X"), "X"); ASSERT_EQ(ctx.OutputName("Y"), "Y"); - auto* x 
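With the angle-bracket arguments that this rendering drops, a fluid OpKernel consuming phi::DenseTensor after this change reads roughly as below. Illustrative sketch only: the kernel, op, and variable names are made up, and it assumes the usual framework headers (op_registry.h) plus that InferShape has already set the output dims:

    template <typename T>
    class CopyLikeKernel : public paddle::framework::OpKernel<T> {
     public:
      void Compute(const paddle::framework::ExecutionContext& ctx) const override {
        auto* x = ctx.Input<phi::DenseTensor>("X");
        auto* out = ctx.Output<phi::DenseTensor>("Out");
        auto* out_data = out->mutable_data<T>(ctx.GetPlace());  // dims set by InferShape
        const auto* x_data = x->data<T>();
        for (int64_t i = 0; i < x->numel(); ++i) out_data[i] = x_data[i];
      }
    };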
= ctx.Input("X"); - auto* y = ctx.Output("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Output("Y"); ASSERT_NE(x, y); ASSERT_NE(y, nullptr); } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc index 0e1a75ebe64ee..79ba56ab147a3 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc @@ -37,7 +37,7 @@ namespace paddle2cinn { using ir::Graph; using ir::Node; -using CinnTensor = ::cinn::hlir::framework::Tensor; +using CinnTensor = ::cinn::hlir::Tensor; using OpMapperContext = CinnGraphSymbolization::OpMapperContext; using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc; using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap; @@ -45,7 +45,7 @@ using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap; namespace utils { OpMapperContext::FeedInfo GetCinnFeedInfoFromTensor( - const Tensor& tensor, bool skip_trans_type = false) { + const phi::DenseTensor& tensor, bool skip_trans_type = false) { OpMapperContext::FeedInfo info; const auto& dim = tensor.dims(); for (int i = 0; i < dim.size(); i++) { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc index 12bd9564c1ae3..929f009b2a3a2 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -24,7 +24,7 @@ namespace paddle2cinn { using ::cinn::frontend::NetBuilder; using ir::Graph; using ir::Node; -using CinnTensor = ::cinn::hlir::framework::Tensor; +using CinnTensor = ::cinn::hlir::Tensor; using OpMapperContext = CinnGraphSymbolization::OpMapperContext; using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc; using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc index 2dd09771cc5ea..ee030bb39caa9 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc @@ -52,7 +52,7 @@ Program CreateAddProgram() { return program; } -void SetRandData(hlir::framework::Tensor tensor, Target target) { +void SetRandData(hlir::Tensor tensor, Target target) { auto* data = tensor->mutable_data(target); std::random_device seed; std::default_random_engine engine(seed()); @@ -96,8 +96,8 @@ TEST(net_build, program_execute_multi_elementwise_add) { hlir::framework::GraphCompiler gc(target, scope, graph); auto runtime_program = gc.Build(); - scope->Var("A"); - scope->Var("B"); + scope->Var("A"); + scope->Var("B"); auto A = scope->GetTensor("A"); auto B = scope->GetTensor("B"); @@ -133,10 +133,10 @@ TEST(net_build, program_execute_fc) { hlir::framework::GraphCompiler gc(target, scope, graph); auto runtime_program = gc.Build(); - scope->Var(std::string(a.id())); - scope->Var(std::string(w.id())); - scope->Var(std::string(b.id())); - scope->Var(std::string(mul_out->id)); + scope->Var(std::string(a.id())); + scope->Var(std::string(w.id())); + scope->Var(std::string(b.id())); + scope->Var(std::string(mul_out->id)); auto a_ten = scope->GetTensor(std::string(a.id())); auto w_ten = scope->GetTensor(std::string(w.id())); diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index e1dbd85f129ad..d2d0af8effd10 100644 --- a/paddle/fluid/framework/program_desc.h +++ 
b/paddle/fluid/framework/program_desc.h @@ -75,14 +75,14 @@ class ProgramDesc { // fetch_ops. const std::vector GetFetchTargetNames(); - // The input variable of feed_op that holds input Tensor provided by users is - // referenced as feed_holder. - // This function is used to change or unify the feed_holder variables' name. + // The input variable of feed_op that holds input phi::DenseTensor provided by + // users is referenced as feed_holder. This function is used to change or + // unify the feed_holder variables' name. void SetFeedHolderName(const std::string &feed_holder_name); - // The output variable of fetch_op that holds output Tensor needed by users is - // referenced as fetch_holder. - // This function is used to change or unify the fetch_holder variables' name. + // The output variable of fetch_op that holds output phi::DenseTensor needed + // by users is referenced as fetch_holder. This function is used to change or + // unify the fetch_holder variables' name. void SetFetchHolderName(const std::string &fetch_holder_name); std::string CachedHashString(); diff --git a/paddle/fluid/framework/save_load_util.cc b/paddle/fluid/framework/save_load_util.cc index cb6120d5e5ee6..dee25b998a67c 100644 --- a/paddle/fluid/framework/save_load_util.cc +++ b/paddle/fluid/framework/save_load_util.cc @@ -38,7 +38,7 @@ void CheckInStreamState(std::istream& istre, size_t length) { struct DeserializedDataFunctor { DeserializedDataFunctor(void** buf, - Tensor* tensor, + phi::DenseTensor* tensor, const platform::Place& place) : buf_(buf), tensor_(tensor), place_(place) {} @@ -48,7 +48,7 @@ struct DeserializedDataFunctor { } void** buf_; - Tensor* tensor_; + phi::DenseTensor* tensor_; platform::Place place_; }; @@ -58,13 +58,14 @@ size_t ReadTensorNumber(std::istream& istre) { sizeof(char) * tensor_number_mark.size()); std::string str_read_tensor_number_mark(tensor_number_mark_buffer, tensor_number_mark.size()); - PADDLE_ENFORCE_EQ(tensor_number_mark, - str_read_tensor_number_mark, - platform::errors::InvalidArgument( - "Tensor number mark does not match, expect mark is " - "[%s], but the mark read from file is [%s].", - tensor_number_mark, - str_read_tensor_number_mark)); + PADDLE_ENFORCE_EQ( + tensor_number_mark, + str_read_tensor_number_mark, + platform::errors::InvalidArgument( + "phi::DenseTensor number mark does not match, expect mark is " + "[%s], but the mark read from file is [%s].", + tensor_number_mark, + str_read_tensor_number_mark)); size_t tensor_number = 0; istre.read(reinterpret_cast(&tensor_number), sizeof(tensor_number)); @@ -82,13 +83,14 @@ std::string ReadTensorName(std::istream& istre) { std::string str_read_tensor_name_mark(name_mark_buffer, tensor_name_mark.size()); - PADDLE_ENFORCE_EQ(tensor_name_mark, - str_read_tensor_name_mark, - platform::errors::InvalidArgument( - "Tensor name mark does not match, expect mark is [%s], " - "but the mark read from file is [%s].", - tensor_name_mark, - str_read_tensor_name_mark)); + PADDLE_ENFORCE_EQ( + tensor_name_mark, + str_read_tensor_name_mark, + platform::errors::InvalidArgument( + "phi::DenseTensor name mark does not match, expect mark is [%s], " + "but the mark read from file is [%s].", + tensor_name_mark, + str_read_tensor_name_mark)); size_t tensor_name_length = 0; istre.read(reinterpret_cast(&tensor_name_length), @@ -120,7 +122,7 @@ bool SaveStaticNameListToDisk( const std::string& file_name, const std::vector& vec_tensor_name_list, const Scope& scope) { - std::map map_tensor; + std::map map_tensor; for (size_t i = 0; i < 
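ReadTensorNumber/ReadTensorName above parse a simple framing: a fixed text mark, then a length, then that many payload bytes. A plain-C++ sketch of the read side (helper name is made up; the real mark strings live in save_load_util.cc, and the shown code enforces that the mark read back matches the expected one):

    #include <istream>
    #include <memory>
    #include <string>

    std::string ReadLengthPrefixedName(std::istream& is, const std::string& mark) {
      std::string read_mark(mark.size(), '\0');
      is.read(&read_mark[0], mark.size());          // expected to equal `mark`
      size_t length = 0;
      is.read(reinterpret_cast<char*>(&length), sizeof(length));
      std::unique_ptr<char[]> buf(new char[length]);
      is.read(buf.get(), length);
      return std::string(buf.get(), length);
    }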
vec_tensor_name_list.size(); ++i) { auto var_ptr = scope.FindVar(vec_tensor_name_list[i]); @@ -131,7 +133,7 @@ bool SaveStaticNameListToDisk( "that exe.run(startup_program) has " "been executed.", vec_tensor_name_list[i])); - Tensor* tensor = var_ptr->GetMutable(); + phi::DenseTensor* tensor = var_ptr->GetMutable(); PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true, platform::errors::PreconditionNotMet( @@ -149,11 +151,11 @@ bool SaveDygraphVarBaseListToDisk( const std::string& file_name, const std::vector>& vec_var_base_list) { - std::map map_tensor; + std::map map_tensor; for (size_t i = 0; i < vec_var_base_list.size(); ++i) { auto var_ptr = vec_var_base_list[i]->MutableVar(); - Tensor* tensor = var_ptr->GetMutable(); + phi::DenseTensor* tensor = var_ptr->GetMutable(); PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true, @@ -170,7 +172,7 @@ bool SaveDygraphVarBaseListToDisk( const std::vector> LoadDygraphVarBaseListFromDisk(const std::string& file_name) { - std::map> map_load_tensor; + std::map> map_load_tensor; LoadTensorFromDisk(file_name, &map_load_tensor); std::vector> vec_res; @@ -194,7 +196,7 @@ bool LoadStaticNameListFromDisk( const std::string& file_name, const std::vector& vec_tensor_name_list, const Scope& scope) { - std::map> map_load_tensor; + std::map> map_load_tensor; LoadTensorFromDisk(file_name, &map_load_tensor); for (size_t i = 0; i < vec_tensor_name_list.size(); ++i) { @@ -214,7 +216,7 @@ bool LoadStaticNameListFromDisk( "please make sure that exe.run(startup_program) has been executed.", vec_tensor_name_list[i])); - Tensor* tensor = var_ptr->GetMutable(); + phi::DenseTensor* tensor = var_ptr->GetMutable(); PADDLE_ENFORCE_NOT_NULL( tensor, platform::errors::PreconditionNotMet( @@ -261,8 +263,9 @@ bool LoadStaticNameListFromDisk( return true; } -bool SaveTensorToDisk(const std::string& file_name, - const std::map& map_tensor) { +bool SaveTensorToDisk( + const std::string& file_name, + const std::map& map_tensor) { MkDirRecursively(DirName(file_name).c_str()); std::ofstream fout(file_name, std::ios::binary); @@ -316,12 +319,13 @@ bool SaveTensorToDisk(const std::string& file_name, auto* data_ptr = tensor->data(); if (platform::is_gpu_place(tensor->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - framework::Tensor temp; + phi::DenseTensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); data_ptr = temp.data(); #else - PADDLE_THROW(platform::errors::Unavailable( - "Tensor is in CUDA device, but paddle not compiled with CUDA.")); + PADDLE_THROW( + platform::errors::Unavailable("phi::DenseTensor is in CUDA device, " + "but paddle not compiled with CUDA.")); #endif } fout.write(static_cast(data_ptr), @@ -341,7 +345,7 @@ bool SaveTensorToDisk(const std::string& file_name, bool LoadTensorFromDisk( const std::string& file_name, - std::map>* map_tensor) { + std::map>* map_tensor) { std::ifstream fin(file_name, std::ios::binary); PADDLE_ENFORCE_EQ( @@ -356,7 +360,7 @@ bool LoadTensorFromDisk( for (size_t i = 0; i < tensor_number; ++i) { std::string str_tensor_name = ReadTensorName(fin); - std::shared_ptr tensor_temp(new Tensor()); + std::shared_ptr tensor_temp(new phi::DenseTensor()); uint32_t version; fin.read(reinterpret_cast(&version), sizeof(version)); CheckInStreamState(fin, sizeof(version)); diff --git a/paddle/fluid/framework/save_load_util.h b/paddle/fluid/framework/save_load_util.h index f4ec7fafdcb9e..4f8360d96f6d3 100644 --- a/paddle/fluid/framework/save_load_util.h +++ b/paddle/fluid/framework/save_load_util.h @@ -47,12 +47,13 @@ bool 
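For orientation, the intended call pattern for the two static save/load helpers above looks roughly like this (a fragment, not a full program: the file path and parameter names are made up, and `scope` is assumed to already hold initialized persistable tensors with these names, e.g. after exe.run(startup_program)):

    std::vector<std::string> names = {"fc_0.w_0", "fc_0.b_0"};
    paddle::framework::SaveStaticNameListToDisk("/tmp/params.bin", names, scope);
    // ... later, after the variables have been re-created in a fresh scope ...
    paddle::framework::LoadStaticNameListFromDisk("/tmp/params.bin", names, scope);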
SaveDygraphVarBaseListToDisk( const std::vector> LoadDygraphVarBaseListFromDisk(const std::string& file_name); -bool SaveTensorToDisk(const std::string& file_name, - const std::map& map_tensor); +bool SaveTensorToDisk( + const std::string& file_name, + const std::map& map_tensor); bool LoadTensorFromDisk( const std::string& file_name, - std::map>* map_tensor); + std::map>* map_tensor); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/save_load_util_test.cc b/paddle/fluid/framework/save_load_util_test.cc index 623f0f27bdaa2..b8b5888236f0e 100644 --- a/paddle/fluid/framework/save_load_util_test.cc +++ b/paddle/fluid/framework/save_load_util_test.cc @@ -23,10 +23,10 @@ namespace framework { TEST(test_save_load_util, test_save_load) { srand(time(NULL)); auto cpu_place = platform::CPUPlace(); - Tensor tensor1; + phi::DenseTensor tensor1; tensor1.Resize({1000, 1000}); auto src_data_1 = tensor1.mutable_data(cpu_place); - Tensor tensor2; + phi::DenseTensor tensor2; tensor2.Resize({5000, 1000}); auto src_data_2 = tensor2.mutable_data(cpu_place); @@ -42,13 +42,13 @@ TEST(test_save_load_util, test_save_load) { src_data_2[i] = temp; } - std::map map_tensor; + std::map map_tensor; map_tensor["t1"] = &tensor1; map_tensor["t2"] = &tensor2; SaveTensorToDisk("test_1", map_tensor); - std::map> load_map_tensor; + std::map> load_map_tensor; LoadTensorFromDisk("test_1", &load_map_tensor); diff --git a/paddle/fluid/framework/selected_rows_utils_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc index 340acf53efa9d..1031a221a0796 100644 --- a/paddle/fluid/framework/selected_rows_utils_test.cc +++ b/paddle/fluid/framework/selected_rows_utils_test.cc @@ -28,7 +28,7 @@ class SelectedRowsTester : public ::testing::Test { int64_t row_numel = 100; selected_rows_.reset(new phi::SelectedRows(rows, height)); - Tensor* value = selected_rows_->mutable_value(); + phi::DenseTensor* value = selected_rows_->mutable_value(); auto* data = value->mutable_data( phi::make_ddim({static_cast(rows.size()), row_numel}), place_); for (int64_t i = 0; i < value->numel(); ++i) { @@ -98,7 +98,7 @@ TEST(SelectedRows, SparseTable) { ASSERT_TRUE(table.HasKey(6)); ASSERT_EQ(table.rows().size(), 3UL); - framework::Tensor ids; + phi::DenseTensor ids; ids.Resize(phi::make_ddim({4})); auto* ids_data = ids.mutable_data(cpu); ids_data[0] = static_cast(6); @@ -106,7 +106,7 @@ TEST(SelectedRows, SparseTable) { ids_data[2] = static_cast(8); ids_data[3] = static_cast(10); - framework::Tensor get_value; + phi::DenseTensor get_value; auto* value_data = get_value.mutable_data(phi::make_ddim({4, embedding_width}), cpu); table.Get(ids, &get_value); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index fcb061aa93288..a83b3baa85e52 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -22,7 +22,6 @@ namespace paddle { namespace framework { using LoD = std::vector>; -using Tensor = phi::DenseTensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index fcf255dafc2e0..c9d740dcf8fc4 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -22,8 +22,8 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; -TEST(Tensor, Dims) { - framework::Tensor tt; +TEST(DenseTensor, Dims) { + phi::DenseTensor tt; tt.Resize({2, 3, 4}); framework::DDim dims = tt.dims(); ASSERT_EQ(arity(dims), 3); @@ -32,8 +32,8 @@ 
TEST(Tensor, Dims) { } } -TEST(Tensor, DataAssert) { - framework::Tensor src_tensor; +TEST(DenseTensor, DataAssert) { + phi::DenseTensor src_tensor; bool caught = false; try { @@ -41,16 +41,16 @@ TEST(Tensor, DataAssert) { } catch (platform::EnforceNotMet& err) { caught = true; std::string ex_msg = err.what(); - EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call " - "Tensor::mutable_data firstly.") != + EXPECT_TRUE(ex_msg.find("phi::DenseTensor holds no memory. Call " + "phi::DenseTensor::mutable_data firstly.") != std::string::npos); } ASSERT_TRUE(caught); } -TEST(Tensor, MutableData) { +TEST(DenseTensor, MutableData) { { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; float* p1 = nullptr; float* p2 = nullptr; // initialization @@ -99,9 +99,10 @@ TEST(Tensor, MutableData) { EXPECT_NE(p1, p4); EXPECT_NE(p3_holder1.get(), p3_holder2.get()); } - // Not sure if it's desired, but currently, Tensor type can be changed. + // Not sure if it's desired, but currently, phi::DenseTensor type can be + // changed. { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; int8_t* p1 = src_tensor.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); EXPECT_NE(p1, nullptr); @@ -115,7 +116,7 @@ TEST(Tensor, MutableData) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; float* p1 = nullptr; float* p2 = nullptr; // initialization @@ -144,7 +145,7 @@ TEST(Tensor, MutableData) { #endif #ifdef PADDLE_WITH_ASCEND_CL { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; float* p1 = nullptr; float* p2 = nullptr; // initialization @@ -173,10 +174,10 @@ TEST(Tensor, MutableData) { #endif } -TEST(Tensor, ShareDataWith) { +TEST(DenseTensor, ShareDataWith) { { - framework::Tensor src_tensor; - framework::Tensor dst_tensor; + phi::DenseTensor src_tensor; + phi::DenseTensor dst_tensor; // Try to share data form uninitialized tensor bool caught = false; try { @@ -184,8 +185,8 @@ TEST(Tensor, ShareDataWith) { } catch (paddle::platform::EnforceNotMet& err) { caught = true; std::string ex_msg = err.what(); - EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call " - "Tensor::mutable_data firstly.") != + EXPECT_TRUE(ex_msg.find("phi::DenseTensor holds no memory. 
Call " + "phi::DenseTensor::mutable_data firstly.") != std::string::npos); } ASSERT_TRUE(caught); @@ -198,8 +199,8 @@ TEST(Tensor, ShareDataWith) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { - framework::Tensor src_tensor; - framework::Tensor dst_tensor; + phi::DenseTensor src_tensor; + phi::DenseTensor dst_tensor; src_tensor.mutable_data(phi::make_ddim({2, 3, 4}), platform::CUDAPlace(0)); dst_tensor.ShareDataWith(src_tensor); @@ -208,8 +209,8 @@ TEST(Tensor, ShareDataWith) { #endif #ifdef PADDLE_WITH_ASCEND_CL { - framework::Tensor src_tensor; - framework::Tensor dst_tensor; + phi::DenseTensor src_tensor; + phi::DenseTensor dst_tensor; src_tensor.mutable_data(phi::make_ddim({2, 3, 4}), platform::NPUPlace(0)); dst_tensor.ShareDataWith(src_tensor); @@ -218,12 +219,12 @@ TEST(Tensor, ShareDataWith) { #endif } -TEST(Tensor, Slice) { +TEST(DenseTensor, Slice) { { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({5, 3, 4}), platform::CPUPlace()); - framework::Tensor slice_tensor = src_tensor.Slice(1, 3); + phi::DenseTensor slice_tensor = src_tensor.Slice(1, 3); framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 3); EXPECT_EQ(slice_dims[0], 2); @@ -246,10 +247,10 @@ TEST(Tensor, Slice) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 9}), platform::CUDAPlace(0)); - framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + phi::DenseTensor slice_tensor = src_tensor.Slice(2, 6); framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); @@ -273,10 +274,10 @@ TEST(Tensor, Slice) { #ifdef PADDLE_WITH_ASCEND_CL { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 9}), platform::NPUPlace(0)); - framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + phi::DenseTensor slice_tensor = src_tensor.Slice(2, 6); framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); EXPECT_EQ(slice_dims[0], 4); @@ -299,27 +300,27 @@ TEST(Tensor, Slice) { #endif } -TEST(Tensor, ReshapeToMatrix) { - framework::Tensor src; +TEST(DenseTensor, ReshapeToMatrix) { + phi::DenseTensor src; int* src_ptr = src.mutable_data({2, 3, 4, 9}, platform::CPUPlace()); for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { src_ptr[i] = i; } - framework::Tensor res = framework::ReshapeToMatrix(src, 2); + phi::DenseTensor res = framework::ReshapeToMatrix(src, 2); ASSERT_EQ(res.dims()[0], 2 * 3); ASSERT_EQ(res.dims()[1], 4 * 9); } -TEST(Tensor, Layout) { - framework::Tensor src; +TEST(DenseTensor, Layout) { + phi::DenseTensor src; ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW); src.set_layout(framework::DataLayout::kAnyLayout); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); } -TEST(Tensor, FP16) { +TEST(DenseTensor, FP16) { using platform::float16; - framework::Tensor src; + phi::DenseTensor src; float16* src_ptr = src.mutable_data({2, 3}, platform::CPUPlace()); for (int i = 0; i < 2 * 3; ++i) { src_ptr[i] = static_cast(i); @@ -327,15 +328,16 @@ TEST(Tensor, FP16) { EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16)); // EXPECT a human readable error message // src.data(); - // Tensor holds the wrong type, it holds N6paddle8platform7float16E at + // phi::DenseTensor holds the wrong type, it holds N6paddle8platform7float16E + // at // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43] 
} -TEST(Tensor, Split) { +TEST(DenseTensor, Split) { { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 2}), platform::CPUPlace()); - std::vector split_tensor_list = src_tensor.Split(2, 0); + std::vector split_tensor_list = src_tensor.Split(2, 0); ASSERT_EQ(split_tensor_list.size(), 3UL); EXPECT_EQ(split_tensor_list[0].dims()[0], 2); EXPECT_EQ(split_tensor_list[1].dims()[0], 2); @@ -361,10 +363,10 @@ TEST(Tensor, Split) { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 4}), platform::CUDAPlace(0)); - std::vector split_tensor_list = src_tensor.Split(2, 0); + std::vector split_tensor_list = src_tensor.Split(2, 0); ASSERT_EQ(split_tensor_list.size(), 3UL); EXPECT_EQ(split_tensor_list[0].dims()[0], 2); EXPECT_EQ(split_tensor_list[1].dims()[0], 2); @@ -393,11 +395,11 @@ TEST(Tensor, Split) { #endif } -TEST(Tensor, Chunk) { +TEST(DenseTensor, Chunk) { { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 2}), platform::CPUPlace()); - std::vector split_tensor_list = src_tensor.Chunk(3, 0); + std::vector split_tensor_list = src_tensor.Chunk(3, 0); ASSERT_EQ(split_tensor_list.size(), 3UL); EXPECT_EQ(split_tensor_list[0].dims()[0], 2); EXPECT_EQ(split_tensor_list[1].dims()[0], 2); @@ -423,10 +425,10 @@ TEST(Tensor, Chunk) { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; src_tensor.mutable_data(phi::make_ddim({6, 4}), platform::CUDAPlace(0)); - std::vector split_tensor_list = src_tensor.Chunk(3, 0); + std::vector split_tensor_list = src_tensor.Chunk(3, 0); ASSERT_EQ(split_tensor_list.size(), 3UL); EXPECT_EQ(split_tensor_list[0].dims()[0], 2); EXPECT_EQ(split_tensor_list[1].dims()[0], 2); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index ca1a65be7d0ab..efc7f685bc90b 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -137,7 +137,7 @@ void TensorCopyImpl(const TENSOR& src, platform::is_npu_place(dst_place)) { // 1. cpu tensor -> npu pinned tensor platform::NPUPinnedPlace npu_pinned_place; - Tensor npu_pinned_tensor; + phi::DenseTensor npu_pinned_tensor; npu_pinned_tensor.Resize(src.dims()); auto npu_pinned_ptr = npu_pinned_tensor.mutable_data(npu_pinned_place, src.dtype()); @@ -179,12 +179,13 @@ void TensorCopyImpl(const TENSOR& src, auto src_npu_pinned_place = src_place; auto dst_npu_place = dst_place; auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_npu_place(ctx_place), - true, - platform::errors::PreconditionNotMet( - "Device context place mismatch. When copying Tensor " - "data from NPU Pinned memory to NPU memory, current " - "device context place should be NPU.")); + PADDLE_ENFORCE_EQ( + platform::is_npu_place(ctx_place), + true, + platform::errors::PreconditionNotMet( + "Device context place mismatch. 
When copying phi::DenseTensor " + "data from NPU Pinned memory to NPU memory, current " + "device context place should be NPU.")); auto ctx_npu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_npu_place, ctx_npu_place, @@ -204,12 +205,13 @@ void TensorCopyImpl(const TENSOR& src, auto src_npu_place = src_place; auto dst_npu_pinned_place = dst_place; auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_npu_place(ctx_place), - true, - platform::errors::PreconditionNotMet( - "Device context place mismatch. When copying Tensor " - "data from NPU memory to NPU Pinned memory, current " - "device context place should be NPU.")); + PADDLE_ENFORCE_EQ( + platform::is_npu_place(ctx_place), + true, + platform::errors::PreconditionNotMet( + "Device context place mismatch. When copying phi::DenseTensor " + "data from NPU memory to NPU Pinned memory, current " + "device context place should be NPU.")); auto ctx_npu_place = ctx_place; PADDLE_ENFORCE_EQ(src_place, ctx_npu_place, @@ -291,12 +293,13 @@ void TensorCopyImpl(const TENSOR& src, auto src_gpu_place = src_place; auto dst_cuda_pinned_place = dst_place; auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), - true, - platform::errors::PreconditionNotMet( - "Device context place mismatch. When copying Tensor " - "data from GPU memory to CUDA Pinned memory, current " - "device context place should be GPU.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx_place), + true, + platform::errors::PreconditionNotMet( + "Device context place mismatch. When copying phi::DenseTensor " + "data from GPU memory to CUDA Pinned memory, current " + "device context place should be GPU.")); auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, @@ -315,12 +318,13 @@ void TensorCopyImpl(const TENSOR& src, auto src_cuda_pinned_place = src_place; auto dst_gpu_place = dst_place; auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), - true, - platform::errors::PreconditionNotMet( - "Device context place mismatch. When copying Tensor " - "data from CUDA Pinned memory to GPU memory, current " - "device context place should be GPU.")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx_place), + true, + platform::errors::PreconditionNotMet( + "Device context place mismatch. 
When copying phi::DenseTensor " + "data from CUDA Pinned memory to GPU memory, current " + "device context place should be GPU.")); auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, @@ -440,21 +444,21 @@ void TensorCopyImpl(const TENSOR& src, TensorCopyImpl(src, dst_place, *dev_ctx, dst); } -void TensorCopy(const Tensor& src, +void TensorCopy(const phi::DenseTensor& src, const platform::Place& dst_place, - Tensor* dst) { - TensorCopyImpl(src, dst_place, dst); + phi::DenseTensor* dst) { + TensorCopyImpl(src, dst_place, dst); } -void TensorCopy(const Tensor& src, +void TensorCopy(const phi::DenseTensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, - Tensor* dst) { - TensorCopyImpl(src, dst_place, ctx, dst); + phi::DenseTensor* dst) { + TensorCopyImpl(src, dst_place, ctx, dst); } -void TensorCopySync(const Tensor& src, +void TensorCopySync(const phi::DenseTensor& src, const platform::Place& dst_place, - Tensor* dst) { + phi::DenseTensor* dst) { if (&src == dst) { auto src_copy = src; TensorCopySync(src_copy, dst_place, dst); @@ -652,7 +656,7 @@ void TensorCopySync(const Tensor& src, } void TensorToStream(std::ostream& os, - const Tensor& tensor, + const phi::DenseTensor& tensor, const platform::DeviceContext& dev_ctx) { { // the 1st field, uint32_t version constexpr uint32_t version = 0; @@ -813,7 +817,7 @@ void TensorToStream(std::ostream& os, struct DeserializedDataFunctor { DeserializedDataFunctor(void** buf, - Tensor* tensor, + phi::DenseTensor* tensor, const platform::Place& place) : buf_(buf), tensor_(tensor), place_(place) {} @@ -823,12 +827,12 @@ struct DeserializedDataFunctor { } void** buf_; - Tensor* tensor_; + phi::DenseTensor* tensor_; platform::Place place_; }; void TensorFromStream(std::istream& is, - Tensor* tensor, + phi::DenseTensor* tensor, const platform::DeviceContext& dev_ctx, const size_t& seek, const std::vector& shape) { @@ -870,7 +874,7 @@ void TensorFromStream(std::istream& is, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) - Tensor cpu_tensor; + phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(shape)); framework::VisitDataType( desc.data_type(), @@ -907,7 +911,7 @@ void TensorFromStream(std::istream& is, } void TensorFromStream(std::istream& is, - Tensor* tensor, + phi::DenseTensor* tensor, const platform::DeviceContext& dev_ctx) { uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); @@ -926,10 +930,10 @@ void TensorFromStream(std::istream& is, is.good(), true, platform::errors::Unavailable("Cannot read tensor desc size")); - PADDLE_ENFORCE_GE( - size, - 0, - platform::errors::InvalidArgument("Tensor desc size should >= 0")); + PADDLE_ENFORCE_GE(size, + 0, + platform::errors::InvalidArgument( + "phi::DenseTensor desc size should >= 0")); std::unique_ptr buf(new char[size]); is.read(reinterpret_cast(buf.get()), size); PADDLE_ENFORCE_EQ( @@ -953,7 +957,7 @@ void TensorFromStream(std::istream& is, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) - Tensor cpu_tensor; + phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(dims)); framework::VisitDataType( desc.data_type(), @@ -994,7 +998,7 @@ void TensorFromStream(std::istream& is, // get tensor data point by DLDataType void* 
GetDstPtrByDLDataType(DLDataType type, - framework::Tensor* dst, + phi::DenseTensor* dst, const platform::Place& dst_place) { // vector types not currently supported PADDLE_ENFORCE_LE(type.lanes, @@ -1060,7 +1064,7 @@ void* GetDstPtrByDLDataType(DLDataType type, } } -void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { +void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) { platform::CPUPlace dst_place = platform::CPUPlace(); platform::CPUPlace src_place = platform::CPUPlace(); @@ -1103,13 +1107,13 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { } template -std::string format_tensor(const framework::Tensor& tensor) { +std::string format_tensor(const phi::DenseTensor& tensor) { // TODO(zhiqiu): use the print option to format tensor. return "NOT IMPLEMENTED"; } template -std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { +std::ostream& print_tensor(std::ostream& os, const phi::DenseTensor& tensor) { auto inspect = tensor.data(); auto element_num = tensor.numel(); @@ -1136,7 +1140,7 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { template <> std::ostream& print_tensor>( - std::ostream& os, const framework::Tensor& tensor) { + std::ostream& os, const phi::DenseTensor& tensor) { auto inspect = tensor.data>(); auto element_num = tensor.numel(); @@ -1154,7 +1158,7 @@ std::ostream& print_tensor>( template <> std::ostream& print_tensor>( - std::ostream& os, const framework::Tensor& tensor) { + std::ostream& os, const phi::DenseTensor& tensor) { auto inspect = tensor.data>(); auto element_num = tensor.numel(); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 0780976b2c6f0..d1dc5e45c2d8c 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -60,13 +60,13 @@ class PrintOptions { }; void TensorToStream(std::ostream& os, - const Tensor& tensor, + const phi::DenseTensor& tensor, const platform::DeviceContext& dev_ctx); void TensorFromStream(std::istream& is, - Tensor* tensor, + phi::DenseTensor* tensor, const platform::DeviceContext& dev_ctx); void TensorFromStream(std::istream& is, - Tensor* tensor, + phi::DenseTensor* tensor, const platform::DeviceContext& dev_ctx, const size_t& seek, const std::vector& shape); @@ -77,10 +77,10 @@ void TensorFromStream(std::istream& is, // If ctx_place and src_place are the same, src_ctx.Wait() is added // after memory::Copy; if ctx_place and dst_place are the same, // src_ctx.Wait() is added before memory::Copy. -void TensorCopy(const Tensor& src, +void TensorCopy(const phi::DenseTensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, - Tensor* dst); + phi::DenseTensor* dst); // NOTE(zcd): If the src.place() and dst_place are two different GPU, // the copy operation is carried out on the dst_place's stream. This is @@ -89,30 +89,30 @@ void TensorCopy(const Tensor& src, // stream, if this copy operation is carried out on the src_place's stream, // when dst is used in dst_place's stream the copy operation may be // not completed. 
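// For orientation, a minimal sketch of calling the copy helpers declared in this
// header once they take phi::DenseTensor; it assumes a CPU-only build, and the
// function name CopySketch plus the 2x3 shape are illustrative, not part of the
// patch itself.
#include "paddle/fluid/framework/tensor_util.h"

void CopySketch() {
  paddle::platform::CPUPlace cpu_place;
  phi::DenseTensor src;
  int* p = src.mutable_data<int>(phi::make_ddim({2, 3}), cpu_place);
  for (int i = 0; i < 6; ++i) p[i] = i;

  phi::DenseTensor dst;
  // Blocking variant: returns once dst holds the data at dst_place.
  paddle::framework::TensorCopySync(src, cpu_place, &dst);

  // Context-driven variant: the supplied device context decides which stream
  // performs the transfer (see the NOTE above on stream selection).
  phi::CPUContext ctx(cpu_place);
  paddle::framework::TensorCopy(src, cpu_place, ctx, &dst);
}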
-void TensorCopy(const Tensor& src, +void TensorCopy(const phi::DenseTensor& src, const platform::Place& dst_place, - Tensor* dst); + phi::DenseTensor* dst); -void TensorCopySync(const Tensor& src, +void TensorCopySync(const phi::DenseTensor& src, const platform::Place& dst_place, - Tensor* dst); + phi::DenseTensor* dst); template void TensorFromVector(const std::vector& src, const platform::DeviceContext& ctx, - Tensor* dst); + phi::DenseTensor* dst); template -void TensorFromVector(const std::vector& src, Tensor* dst); +void TensorFromVector(const std::vector& src, phi::DenseTensor* dst); template -void TensorToVector(const Tensor& src, +void TensorToVector(const phi::DenseTensor& src, const platform::DeviceContext& ctx, std::vector* dst); template -void TesnorToVector(const Tensor& src, std::vector* dst); +void TesnorToVector(const phi::DenseTensor& src, std::vector* dst); // convert dlpack's DLTensor to tensor -void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst); +void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst); // // The implementation of template functions. @@ -122,7 +122,7 @@ template void TensorFromArray(const T* src, const size_t& array_size, const platform::DeviceContext& ctx, - Tensor* dst) { + phi::DenseTensor* dst) { auto dst_place = ctx.GetPlace(); auto src_ptr = static_cast(src); platform::CPUPlace src_place; @@ -147,7 +147,7 @@ void TensorFromArray(const T* src, else if (platform::is_npu_place(dst_place)) { // NOLINT // 1. vector -> npu pinned tensor platform::NPUPinnedPlace npu_pinned_place; - Tensor npu_pinned_tensor; + phi::DenseTensor npu_pinned_tensor; npu_pinned_tensor.Resize(dst->dims()); auto npu_pinned_ptr = npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype()); @@ -199,7 +199,7 @@ void TensorFromArray(const T* src, template void TensorFromVector(const std::vector& src, const platform::DeviceContext& ctx, - Tensor* dst) { + phi::DenseTensor* dst) { auto dst_place = ctx.GetPlace(); auto src_ptr = static_cast(src.data()); platform::CPUPlace src_place; @@ -229,7 +229,7 @@ void TensorFromVector(const std::vector& src, // so pass nullptr as stream to memory::Copy(). else if (platform::is_npu_place(dst_place)) { // NOLINT // 1. vector -> npu pinned tensor - Tensor npu_pinned_tensor(dst->dtype()); + phi::DenseTensor npu_pinned_tensor(dst->dtype()); platform::NPUPinnedPlace npu_pinned_place; auto npu_pinned_ptr = npu_pinned_tensor.mutable_data(dst->dims(), npu_pinned_place); @@ -288,7 +288,7 @@ void TensorFromVector(const std::vector& src, template <> inline void TensorFromVector(const std::vector& src, const platform::DeviceContext& ctx, - Tensor* dst) { + phi::DenseTensor* dst) { // vector has no data() member, use array instead. // See details: // https://stackoverflow.com/questions/46115669/why-does-stdvectorbool-have-no-data/46115714 @@ -321,7 +321,7 @@ inline void TensorFromVector(const std::vector& src, else if (platform::is_npu_place(dst_place)) { // NOLINT // 1. 
vector -> npu pinned tensor platform::NPUPinnedPlace npu_pinned_place; - Tensor npu_pinned_tensor; + phi::DenseTensor npu_pinned_tensor; npu_pinned_tensor.Resize(dst->dims()); auto npu_pinned_ptr = npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype()); @@ -368,7 +368,7 @@ inline void TensorFromVector(const std::vector& src, } template -void TensorFromVector(const std::vector& src, Tensor* dst) { +void TensorFromVector(const std::vector& src, phi::DenseTensor* dst) { platform::CPUPlace dst_place = platform::CPUPlace(); auto src_ptr = static_cast(src.data()); platform::CPUPlace src_place; @@ -380,7 +380,8 @@ void TensorFromVector(const std::vector& src, Tensor* dst) { } template <> -inline void TensorFromVector(const std::vector& src, Tensor* dst) { +inline void TensorFromVector(const std::vector& src, + phi::DenseTensor* dst) { bool* array = new bool[src.size()]; for (unsigned int i = 0; i < src.size(); i++) { array[i] = static_cast(src[i]); @@ -397,7 +398,7 @@ inline void TensorFromVector(const std::vector& src, Tensor* dst) { } template -void TensorToVector(const Tensor& src, +void TensorToVector(const phi::DenseTensor& src, const platform::DeviceContext& ctx, std::vector* dst) { auto src_ptr = static_cast(src.data()); @@ -453,7 +454,7 @@ void TensorToVector(const Tensor& src, } template <> -inline void TensorToVector(const Tensor& src, +inline void TensorToVector(const phi::DenseTensor& src, const platform::DeviceContext& ctx, std::vector* dst) { auto src_ptr = static_cast(src.data()); @@ -505,7 +506,7 @@ inline void TensorToVector(const Tensor& src, } template -void TensorToVector(const Tensor& src, std::vector* dst) { +void TensorToVector(const phi::DenseTensor& src, std::vector* dst) { auto src_ptr = static_cast(src.data()); auto size = src.numel() * sizeof(T); @@ -524,7 +525,8 @@ void TensorToVector(const Tensor& src, std::vector* dst) { } template <> -inline void TensorToVector(const Tensor& src, std::vector* dst) { +inline void TensorToVector(const phi::DenseTensor& src, + std::vector* dst) { auto src_ptr = static_cast(src.data()); auto size = src.numel() * sizeof(bool); @@ -551,31 +553,32 @@ inline void TensorToVector(const Tensor& src, std::vector* dst) { std::ostream& operator<<(std::ostream& os, const LoD& lod); -inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { +inline phi::DenseTensor ReshapeToMatrix(const phi::DenseTensor& src, + int num_col_dims) { int rank = src.dims().size(); PADDLE_ENFORCE_GE( rank, 2, platform::errors::InvalidArgument( "'ReshapeToMatrix()' is only used for flatten high rank " - "tensors to matrixs. The dimensions of Tensor must be " + "tensors to matrixs. The dimensions of phi::DenseTensor must be " "greater or equal than 2. 
" - "But received dimensions of Tensor is %d", + "But received dimensions of phi::DenseTensor is %d", rank)); if (rank == 2) { return src; } - Tensor res; + phi::DenseTensor res; res.ShareDataWith(src); res.Resize(phi::flatten_to_2d(src.dims(), num_col_dims)); return res; } template -inline T GetValue(const framework::Tensor* x) { +inline T GetValue(const phi::DenseTensor* x) { T value = static_cast(0); if (!platform::is_cpu_place(x->place())) { - framework::Tensor cpu_x; + phi::DenseTensor cpu_x; framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x); #if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index c7db2186e5db7..3d3c7de73b729 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -12,18 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/tensor_util.h" +#include + #include -#include "paddle/fluid/operators/isfinite_op.h" -#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/isfinite_op.h" namespace paddle { namespace framework { TEST(TensorCopy, Tensor) { - Tensor src_tensor; - Tensor dst_tensor; + phi::DenseTensor src_tensor; + phi::DenseTensor dst_tensor; phi::CPUContext cpu_ctx((platform::CPUPlace())); int* src_ptr = src_tensor.mutable_data(phi::make_ddim({3, 3}), @@ -49,7 +50,7 @@ TEST(TensorCopy, Tensor) { EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); - Tensor slice_tensor = src_tensor.Slice(1, 2); + phi::DenseTensor slice_tensor = src_tensor.Slice(1, 2); TensorCopy(slice_tensor, *cpu_place, &dst_tensor); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); @@ -61,9 +62,9 @@ TEST(TensorCopy, Tensor) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { - Tensor src_tensor; - Tensor gpu_tensor; - Tensor dst_tensor; + phi::DenseTensor src_tensor; + phi::DenseTensor gpu_tensor; + phi::DenseTensor dst_tensor; int* src_ptr = src_tensor.mutable_data(phi::make_ddim({3, 3}), platform::CPUPlace()); @@ -71,7 +72,7 @@ TEST(TensorCopy, Tensor) { int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; memcpy(src_ptr, arr, 9 * sizeof(int)); - // CPU Tensor to GPU Tensor + // CPU phi::DenseTensor to GPU phi::DenseTensor auto gpu_place = new platform::CUDAPlace(0); phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() @@ -80,7 +81,7 @@ TEST(TensorCopy, Tensor) { gpu_ctx.PartialInitWithAllocator(); TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); - // GPU Tensor to CPU Tensor + // GPU phi::DenseTensor to CPU phi::DenseTensor auto cpu_place = new platform::CPUPlace(); TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); @@ -101,12 +102,12 @@ TEST(TensorCopy, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); } - Tensor slice_tensor = src_tensor.Slice(1, 2); + phi::DenseTensor slice_tensor = src_tensor.Slice(1, 2); - // CPU Slice Tensor to GPU Tensor + // CPU Slice phi::DenseTensor to GPU phi::DenseTensor TensorCopy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor); - // GPU Tensor to CPU Tensor + // GPU phi::DenseTensor to CPU phi::DenseTensor TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); // Sync before Compare Slice Tensors @@ -126,9 +127,9 @@ TEST(TensorCopy, Tensor) { TEST(TensorFromVector, Tensor) { { 
std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - paddle::framework::Tensor cpu_tensor; + phi::DenseTensor cpu_tensor; - // Copy to CPU Tensor + // Copy to CPU phi::DenseTensor cpu_tensor.Resize(phi::make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); paddle::framework::TensorFromVector(src_vec, &cpu_tensor); @@ -157,11 +158,11 @@ TEST(TensorFromVector, Tensor) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - paddle::framework::Tensor cpu_tensor; - paddle::framework::Tensor gpu_tensor; - paddle::framework::Tensor dst_tensor; + phi::DenseTensor cpu_tensor; + phi::DenseTensor gpu_tensor; + phi::DenseTensor dst_tensor; - // Copy to CPU Tensor + // Copy to CPU phi::DenseTensor cpu_tensor.Resize(phi::make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); phi::CPUContext cpu_ctx(*cpu_place); @@ -219,7 +220,7 @@ TEST(TensorFromVector, Tensor) { TEST(TensorToVector, Tensor) { { - paddle::framework::Tensor src; + phi::DenseTensor src; int* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); for (int i = 0; i < 3 * 3; ++i) { src_ptr[i] = i; @@ -236,7 +237,7 @@ TEST(TensorToVector, Tensor) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - paddle::framework::Tensor gpu_tensor; + phi::DenseTensor gpu_tensor; paddle::platform::CUDAPlace place; phi::GPUContext gpu_ctx(place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() @@ -255,19 +256,22 @@ TEST(TensorToVector, Tensor) { #endif } -TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src; -bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); -for (int i = 0; i < 3 * 3; ++i) { - src_ptr[i] = static_cast(i % 2); -} +TEST(TensorToVector, Tensor_bool) { + phi::DenseTensor src; + bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); + } -paddle::platform::CPUPlace place; -std::vector dst; -paddle::framework::TensorToVector(src, &dst); + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); -for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_ptr[i], dst[i]); + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } } + } // namespace framework #ifdef PADDLE_WITH_CUDA @@ -283,7 +287,7 @@ for (int i = 0; i < 3 * 3; ++i) { true, false, }; - paddle::framework::Tensor gpu_tensor; + phi::DenseTensor gpu_tensor; paddle::platform::CUDAPlace place; phi::GPUContext gpu_ctx(place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() @@ -313,7 +317,7 @@ for (int i = 0; i < 3 * 3; ++i) { true, false, }; - paddle::framework::Tensor npu_tensor; + phi::DenseTensor npu_tensor; paddle::platform::NPUPlace place(0); paddle::platform::NPUDeviceContext npu_ctx(place); paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); @@ -331,7 +335,7 @@ for (int i = 0; i < 3 * 3; ++i) { TEST(TensorFromDLPack, Tensor) { { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - paddle::framework::Tensor cpu_tensor; + phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim({3, 3})); paddle::platform::CPUPlace cpu_place; @@ -339,7 +343,7 @@ TEST(TensorFromDLPack, Tensor) { paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); paddle::framework::DLPackTensor dlpack_tensor(cpu_tensor, 1); - paddle::framework::Tensor dst_tensor; + phi::DenseTensor dst_tensor; 
paddle::framework::TensorFromDLPack(dlpack_tensor, &dst_tensor); auto cpu_ptr = cpu_tensor.data(); @@ -353,12 +357,12 @@ TEST(TensorFromDLPack, Tensor) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - paddle::framework::Tensor cpu_tensor; - paddle::framework::Tensor gpu_tensor; - paddle::framework::Tensor dst_tensor; - paddle::framework::Tensor gpu_tensor_from_dlpack; + phi::DenseTensor cpu_tensor; + phi::DenseTensor gpu_tensor; + phi::DenseTensor dst_tensor; + phi::DenseTensor gpu_tensor_from_dlpack; - // Copy to CPU Tensor + // Copy to CPU phi::DenseTensor cpu_tensor.Resize(phi::make_ddim({3, 3})); paddle::platform::CPUPlace cpu_place; phi::CPUContext cpu_ctx(cpu_place); @@ -396,7 +400,7 @@ TEST(TensorFromDLPack, Tensor) { TEST(TensorContainsNAN, CPU) { { - paddle::framework::Tensor src; + phi::DenseTensor src; float* buf = src.mutable_data({3}, paddle::platform::CPUPlace()); buf[0] = 0.0; buf[1] = NAN; @@ -407,7 +411,7 @@ TEST(TensorContainsNAN, CPU) { } { - paddle::framework::Tensor src; + phi::DenseTensor src; paddle::platform::float16* buf = src.mutable_data( {3}, paddle::platform::CPUPlace()); @@ -422,7 +426,7 @@ TEST(TensorContainsNAN, CPU) { TEST(TensorContainsInf, CPU) { { - paddle::framework::Tensor src; + phi::DenseTensor src; double* buf = src.mutable_data({3}, paddle::platform::CPUPlace()); buf[0] = 1.0; buf[1] = INFINITY; @@ -433,7 +437,7 @@ TEST(TensorContainsInf, CPU) { } { - paddle::framework::Tensor src; + phi::DenseTensor src; paddle::platform::float16* buf = src.mutable_data( {3}, paddle::platform::CPUPlace()); @@ -448,7 +452,7 @@ TEST(TensorContainsInf, CPU) { TEST(TensorIsfinite, CPU) { { - paddle::framework::Tensor src, out; + phi::DenseTensor src, out; double* buf = src.mutable_data({3}, paddle::platform::CPUPlace()); buf[0] = 1.0; buf[1] = INFINITY; @@ -461,7 +465,7 @@ TEST(TensorIsfinite, CPU) { } { - paddle::framework::Tensor src, out; + phi::DenseTensor src, out; double* buf = src.mutable_data({3}, paddle::platform::CPUPlace()); buf[0] = 1.0; buf[1] = NAN; @@ -474,7 +478,7 @@ TEST(TensorIsfinite, CPU) { } { - paddle::framework::Tensor src, out; + phi::DenseTensor src, out; paddle::platform::float16* buf = src.mutable_data( {3}, paddle::platform::CPUPlace()); @@ -493,7 +497,7 @@ TEST(TensorIsfinite, CPU) { } TEST(Tensor, FromAndToStream) { - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; int array[6] = {1, 2, 3, 4, 5, 6}; src_tensor.Resize({2, 3}); int* src_ptr = src_tensor.mutable_data(platform::CPUPlace()); @@ -501,7 +505,7 @@ TEST(Tensor, FromAndToStream) { src_ptr[i] = array[i]; } { - framework::Tensor dst_tensor; + phi::DenseTensor dst_tensor; auto place = new platform::CPUPlace(); phi::CPUContext cpu_ctx(*place); std::ostringstream oss; @@ -518,9 +522,9 @@ TEST(Tensor, FromAndToStream) { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) { - Tensor gpu_tensor; + phi::DenseTensor gpu_tensor; gpu_tensor.Resize({2, 3}); - Tensor dst_tensor; + phi::DenseTensor dst_tensor; auto gpu_place = new platform::CUDAPlace(); phi::GPUContext gpu_ctx(*gpu_place); diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu index 53807beab9171..e4e49340e6615 100644 --- a/paddle/fluid/framework/tensor_util_test.cu +++ b/paddle/fluid/framework/tensor_util_test.cu @@ -62,7 +62,7 @@ TEST(TensorContainsNAN, GPU) { auto& pool = paddle::platform::DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); { - Tensor tensor; + phi::DenseTensor 
tensor; float* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); @@ -73,7 +73,7 @@ TEST(TensorContainsNAN, GPU) { ASSERT_TRUE(TensorContainsNAN(tensor)); } { - Tensor tensor; + phi::DenseTensor tensor; paddle::platform::float16* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP @@ -91,7 +91,7 @@ TEST(TensorContainsInf, GPU) { auto& pool = paddle::platform::DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); { - Tensor tensor; + phi::DenseTensor tensor; float* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); @@ -102,7 +102,7 @@ TEST(TensorContainsInf, GPU) { ASSERT_TRUE(TensorContainsInf(tensor)); } { - Tensor tensor; + phi::DenseTensor tensor; paddle::platform::float16* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP @@ -122,7 +122,7 @@ TEST(TensorIsfinite, GPU) { auto* cuda_ctx = pool.GetByPlace(gpu); // contains inf { - Tensor tensor; + phi::DenseTensor tensor; float* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); @@ -133,7 +133,7 @@ TEST(TensorIsfinite, GPU) { EXPECT_TRUE(!TensorIsfinite(tensor)); } { - Tensor tensor; + phi::DenseTensor tensor; float16* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); @@ -146,7 +146,7 @@ TEST(TensorIsfinite, GPU) { // contains nan { - Tensor tensor; + phi::DenseTensor tensor; float* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); @@ -157,7 +157,7 @@ TEST(TensorIsfinite, GPU) { EXPECT_TRUE(!TensorIsfinite(tensor)); } { - Tensor tensor; + phi::DenseTensor tensor; float16* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); @@ -170,7 +170,7 @@ TEST(TensorIsfinite, GPU) { // all element are finite { - Tensor tensor; + phi::DenseTensor tensor; float* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL( @@ -182,7 +182,7 @@ TEST(TensorIsfinite, GPU) { EXPECT_TRUE(TensorIsfinite(tensor)); } { - Tensor tensor; + phi::DenseTensor tensor; float16* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL( @@ -200,7 +200,7 @@ TEST(TensorContainsInf, GPUWithoutWait) { auto& pool = paddle::platform::DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); { - Tensor tensor, out; + phi::DenseTensor tensor, out; float* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); @@ -210,13 +210,13 @@ TEST(TensorContainsInf, GPUWithoutWait) { cuda_ctx->Wait(); TensorContainsInf(tensor, &out); platform::CPUPlace cpu; - Tensor tmp; + phi::DenseTensor tmp; TensorCopy(out, cpu, *cuda_ctx, &tmp); cuda_ctx->Wait(); ASSERT_EQ(tmp.data()[0], true); } { - Tensor tensor, out; + phi::DenseTensor tensor, out; paddle::platform::float16* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP @@ -227,7 +227,7 @@ TEST(TensorContainsInf, GPUWithoutWait) { cuda_ctx->Wait(); TensorContainsInf(tensor, &out); platform::CPUPlace cpu; - Tensor tmp; + phi::DenseTensor tmp; TensorCopy(out, cpu, *cuda_ctx, &tmp); cuda_ctx->Wait(); ASSERT_EQ(tmp.data()[0], true); @@ -239,7 +239,7 @@ TEST(TensorContainsNAN, 
GPUWithoutWait) { auto& pool = paddle::platform::DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); { - Tensor tensor, out; + phi::DenseTensor tensor, out; float* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); @@ -249,13 +249,13 @@ TEST(TensorContainsNAN, GPUWithoutWait) { cuda_ctx->Wait(); TensorContainsNAN(tensor, &out); platform::CPUPlace cpu; - Tensor tmp; + phi::DenseTensor tmp; TensorCopy(out, cpu, *cuda_ctx, &tmp); cuda_ctx->Wait(); ASSERT_EQ(tmp.data()[0], true); } { - Tensor tensor, out; + phi::DenseTensor tensor, out; paddle::platform::float16* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP @@ -266,7 +266,7 @@ TEST(TensorContainsNAN, GPUWithoutWait) { cuda_ctx->Wait(); TensorContainsNAN(tensor, &out); platform::CPUPlace cpu; - Tensor tmp; + phi::DenseTensor tmp; TensorCopy(out, cpu, *cuda_ctx, &tmp); cuda_ctx->Wait(); ASSERT_EQ(tmp.data()[0], true); @@ -278,7 +278,7 @@ TEST(TensorIsfinite, GPUWithoutWait) { auto& pool = paddle::platform::DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); { - Tensor tensor, out; + phi::DenseTensor tensor, out; float* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); @@ -288,13 +288,13 @@ TEST(TensorIsfinite, GPUWithoutWait) { cuda_ctx->Wait(); TensorIsfinite(tensor, &out); platform::CPUPlace cpu; - Tensor tmp; + phi::DenseTensor tmp; TensorCopy(out, cpu, *cuda_ctx, &tmp); cuda_ctx->Wait(); EXPECT_EQ(tmp.data()[0], false); } { - Tensor tensor, out; + phi::DenseTensor tensor, out; float* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf); @@ -304,13 +304,13 @@ TEST(TensorIsfinite, GPUWithoutWait) { cuda_ctx->Wait(); TensorIsfinite(tensor, &out); platform::CPUPlace cpu; - Tensor tmp; + phi::DenseTensor tmp; TensorCopy(out, cpu, *cuda_ctx, &tmp); cuda_ctx->Wait(); EXPECT_EQ(tmp.data()[0], false); } { - Tensor tensor, out; + phi::DenseTensor tensor, out; float* buf = tensor.mutable_data({3}, gpu); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL( @@ -321,7 +321,7 @@ TEST(TensorIsfinite, GPUWithoutWait) { cuda_ctx->Wait(); TensorIsfinite(tensor, &out); platform::CPUPlace cpu; - Tensor tmp; + phi::DenseTensor tmp; TensorCopy(out, cpu, *cuda_ctx, &tmp); cuda_ctx->Wait(); EXPECT_EQ(tmp.data()[0], true); diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h index a06f92f32d28c..0cf7c70b9bf81 100644 --- a/paddle/fluid/framework/tuple.h +++ b/paddle/fluid/framework/tuple.h @@ -31,7 +31,7 @@ typedef paddle::variant ElementVar; diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index ea7ebce2dae6b..a0a40682a9667 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -178,7 +178,7 @@ struct VarTypeRegistryImpl { // Users should add other variable types below. // Paddle would generate unique Ids for each registered variable types. 
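// For reference, a small sketch of how this registry is queried after the
// rename, mirroring the VarTypeTrait usage changed later in this patch
// (reset_tensor_array.h); kDenseTensorId / kSelectedRowsId are illustrative
// names only.
#include "paddle/fluid/framework/var_type_traits.h"

constexpr auto kDenseTensorId =
    paddle::framework::VarTypeTrait<phi::DenseTensor>::kId;
constexpr auto kSelectedRowsId =
    paddle::framework::VarTypeTrait<phi::SelectedRows>::kId;
// Each registered type keeps the unique, compile-time id mentioned above.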
using VarTypeRegistry = detail::VarTypeRegistryImpl< - Tensor, + phi::DenseTensor, phi::SelectedRows, std::vector, LoDRankTable, diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 68876de0f795a..5524433ce7b3a 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -123,18 +123,19 @@ inline phi::DenseTensor::InplaceVersion* Variable::InplaceVersionCounter() { if (IsType()) { version_counter_ptr = &GetMutable()->InplaceVersionCounter(); - } else if (IsType()) { + } else if (IsType()) { version_counter_ptr = - &GetMutable()->InplaceVersionCounter(); + &GetMutable()->InplaceVersionCounter(); } else if (IsType()) { version_counter_ptr = &GetMutable() ->mutable_value() ->InplaceVersionCounter(); } else { - VLOG(4) << "Only supports Tensor, LoDTensor, SelectedRows to have " - "TensorInplaceVersion, but received type " - << platform::demangle(framework::ToTypeName(Type())); + VLOG(4) + << "Only supports phi::DenseTensor, LoDTensor, SelectedRows to have " + "TensorInplaceVersion, but received type " + << platform::demangle(framework::ToTypeName(Type())); } return version_counter_ptr; } @@ -159,9 +160,10 @@ inline void Variable::BumpInplaceVersion() { if (version_counter_ptr) { return version_counter_ptr->Bump(); } else { - VLOG(4) << "Only supports Tensor, LoDTensor, SelectedRows to have " - "TensorInplaceVersion, but received type " - << platform::demangle(framework::ToTypeName(Type())); + VLOG(4) + << "Only supports phi::DenseTensor, LoDTensor, SelectedRows to have " + "TensorInplaceVersion, but received type " + << platform::demangle(framework::ToTypeName(Type())); } } } // namespace framework diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index b998efc1c230e..22af9ae934e0c 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -29,7 +29,7 @@ TEST(Variable, GetMutable) { EXPECT_EQ("1234", tt); try { - v->GetMutable(); + v->GetMutable(); } catch (std::exception& e) { return; } diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc index c01bef79cdccd..9f07f0f5587b1 100644 --- a/paddle/fluid/framework/version.cc +++ b/paddle/fluid/framework/version.cc @@ -20,7 +20,7 @@ namespace paddle { namespace framework { bool IsProgramVersionSupported(int64_t version) { - /* So far, all old versions of Tensor are supported in the + /* So far, all old versions of phi::DenseTensor are supported in the * new version. The compatibility judgment cannot be made only * by the version number. Please do not use this interface, * it may be discarded because backward compatibility. @@ -29,7 +29,7 @@ bool IsProgramVersionSupported(int64_t version) { } bool IsTensorVersionSupported(uint32_t version) { - /* So far, all old versions of Tensor are supported in the + /* So far, all old versions of phi::DenseTensor are supported in the * new version. The compatibility judgment cannot be made only * by the version number. Please do not use this interface, * it may be discarded because backward compatibility. 
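// To illustrate the Variable API touched above, a hedged sketch of holding a
// phi::DenseTensor and bumping its inplace version; InplaceSketch is an
// illustrative name and error handling is omitted.
#include "paddle/fluid/framework/variable.h"

void InplaceSketch() {
  paddle::framework::Variable var;
  // GetMutable now instantiates a phi::DenseTensor holder directly.
  auto* tensor = var.GetMutable<phi::DenseTensor>();
  (void)tensor;
  if (var.IsType<phi::DenseTensor>()) {
    // Only DenseTensor, LoDTensor and SelectedRows holders carry a
    // TensorInplaceVersion; other types merely log, as shown above.
    var.BumpInplaceVersion();
  }
}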
diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h index 1bb627775942a..b7ec4ecc11ca5 100644 --- a/paddle/fluid/framework/version.h +++ b/paddle/fluid/framework/version.h @@ -21,7 +21,7 @@ namespace paddle { namespace framework { // Note: -// Program and Tensor that pass the IsXXXVersionSupported should +// Program and phi::DenseTensor that pass the IsXXXVersionSupported should // be supported by the current codes. Otherwise, it's a compatibility // bug. diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index c9d3d2591d000..4064c65be6708 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -53,8 +53,8 @@ static const platform::Place &GetVarPlace(const framework::Variable &src) { } } -static void AllReduce(const framework::Tensor &src, - framework::Tensor *dst, +static void AllReduce(const phi::DenseTensor &src, + phi::DenseTensor *dst, const gpuStream_t stream, const platform::NCCLComm *comm) { const auto &place = src.place(); diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index c4b622f98505f..3b2e299a2a745 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -150,7 +150,7 @@ void BasicEngine::CheckBackwardInputs(const OpBase& op) { } auto* inner_var = var->MutableVar(); - framework::Tensor* tensor = nullptr; + phi::DenseTensor* tensor = nullptr; if (!inner_var->IsInitialized() || inner_var->IsType()) { tensor = inner_var->GetMutable(); diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 831e7dae942ae..4fc21b05c44c7 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -33,8 +33,8 @@ namespace paddle { namespace imperative { -static void AllReduce(const framework::Tensor &src, - framework::Tensor *dst, +static void AllReduce(const phi::DenseTensor &src, + phi::DenseTensor *dst, const XPUStream stream, const platform::BKCLComm *comm) { const auto &place = src.place(); @@ -181,7 +181,7 @@ void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, void BKCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; - framework::Tensor *src_tensor = src->GetMutable(); + phi::DenseTensor *src_tensor = src->GetMutable(); const auto &place = src_tensor->place(); platform::BKCLComm *comm = platform::BKCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/imperative/cncl_context.cc b/paddle/fluid/imperative/cncl_context.cc index 9fc2cd6408b21..02242f9593e1b 100644 --- a/paddle/fluid/imperative/cncl_context.cc +++ b/paddle/fluid/imperative/cncl_context.cc @@ -34,8 +34,8 @@ class Variable; namespace paddle { namespace imperative { -static void AllReduce(const framework::Tensor &src, - framework::Tensor *dst, +static void AllReduce(const phi::DenseTensor &src, + phi::DenseTensor *dst, const mluStream stream, const platform::CNCLComm *comm) { const auto &place = src.place(); @@ -174,7 +174,7 @@ void CNCLParallelContext::AllReduceByStream(const framework::Variable &src, void CNCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; - framework::Tensor *src_tensor = src->GetMutable(); + phi::DenseTensor *src_tensor = src->GetMutable(); const auto &place = src_tensor->place(); platform::CNCLComm 
*comm = platform::CNCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index b6c21bead4182..ea140f8ecbee4 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -108,8 +108,8 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, } } -void GLOOParallelContext::AllReduce(const framework::Tensor &src_tensor, - framework::Tensor *dst_tensor) { +void GLOOParallelContext::AllReduce(const phi::DenseTensor &src_tensor, + phi::DenseTensor *dst_tensor) { auto gloo_wrapper = framework::GlooWrapper::GetInstance(); dst_tensor->Resize(src_tensor.dims()); switch (framework::TransToProtoVarType(src_tensor.dtype())) { diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index 5290e3d1315a4..0e82175de0b0c 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -60,7 +60,7 @@ class GLOOParallelContext : public ParallelContext { void SynchronizeCompute() override; private: - void AllReduce(const framework::Tensor& src, framework::Tensor* dst); + void AllReduce(const phi::DenseTensor& src, phi::DenseTensor* dst); void AllReduce(const phi::SelectedRows& src, phi::SelectedRows* dst); private: diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 199359a960326..594b105dc6bce 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -85,8 +85,8 @@ static void MoveOrCopyVar(framework::Variable* dst, #ifdef PADDLE_WITH_XPU template void XPUTensorAddFunctor(const platform::Place& place, - const framework::Tensor& src, - framework::Tensor* dst) { + const phi::DenseTensor& src, + phi::DenseTensor* dst) { using XPUType = typename XPUTypeTrait::Type; platform::XPUDeviceContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc index 975f7896f0d48..5ee4417bd29cd 100644 --- a/paddle/fluid/imperative/hccl_context.cc +++ b/paddle/fluid/imperative/hccl_context.cc @@ -32,8 +32,8 @@ class Variable; namespace paddle { namespace imperative { -static void AllReduce(const framework::Tensor &src, - framework::Tensor *dst, +static void AllReduce(const phi::DenseTensor &src, + phi::DenseTensor *dst, const aclrtStream stream, const platform::HCCLComm *comm) { const auto &place = src.place(); @@ -175,7 +175,7 @@ void HCCLParallelContext::AllReduceByStream(const framework::Variable &src, void HCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; if (src->IsType()) { - framework::Tensor *src_tensor = src->GetMutable(); + phi::DenseTensor *src_tensor = src->GetMutable(); const auto &place = src_tensor->place(); platform::HCCLComm *comm = platform::HCCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 94ac86e97e157..c069d7ed10908 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -143,7 +143,7 @@ void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; - 
framework::Tensor *src_tensor = src->GetMutable(); + phi::DenseTensor *src_tensor = src->GetMutable(); const auto &place = src_tensor->place(); platform::NCCLComm *comm = platform::NCCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 62bbf77a2df1d..61ac4b90b5154 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -58,7 +58,7 @@ const std::shared_ptr& GetVariableWrapper( return var; } -const framework::Tensor* GetTensorFromVar(const framework::Variable& var) { +const phi::DenseTensor* GetTensorFromVar(const framework::Variable& var) { if (var.IsType()) { return &(var.Get()); } else if (var.IsType()) { @@ -91,7 +91,7 @@ void HandleComplexGradToRealGrad(const NameVarMap& outs) { << " var `" << var->Name() << "` to " << framework::DataTypeToString(var->ForwardDataType()) << " real var in dynamic graph."; - framework::Tensor out; + phi::DenseTensor out; framework::TransComplexToReal( var->ForwardDataType(), var->DataType(), *tensor, &out); SetTensorToVariable(var->Var(), out, var->MutableVar()); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 58cae0faead9f..dfa18814de958 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -38,7 +38,7 @@ DECLARE_bool(use_mkldnn); namespace paddle { namespace imperative { -const framework::Tensor* GetTensorFromVar(const framework::Variable& var); +const phi::DenseTensor* GetTensorFromVar(const framework::Variable& var); template static void SetForwardDataTypeOfGradVar(const std::shared_ptr& var); @@ -110,7 +110,7 @@ std::shared_ptr> PrepareData( cache_var->Var(), *tensor, tmp_var->MutableVar()); (*tmp_ins_ptr)[name_pair.first][i] = tmp_var; } else { - framework::Tensor out; + phi::DenseTensor out; TransformData( expected_kernel_key, kernel_type_for_var, *tensor, &out); if (NeedTransformDataType(kernel_type_for_var, @@ -656,7 +656,7 @@ void PreparePhiData(const phi::Kernel& phi_kernel, VLOG(3) << "Phi Transform Variable " << input_names[i] << " from " << tensor_in->place() << " to " << expected_place; - framework::Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; framework::TensorCopySync(*tensor_in, expected_place, &tmp_tensor); SetTensorToVariable(var->Var(), tmp_tensor, var->MutableVar()); diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 24181eec59c4a..4492ca9257d28 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -34,7 +34,7 @@ namespace imperative { defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { - framework::Tensor *tensor = + phi::DenseTensor *tensor = is_sparse_ ? 
sparse_contents_->GetMutable()->mutable_value() : dense_contents_.GetMutable(); @@ -76,7 +76,7 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { template static void ConcatTensorsForAllReduce( const DeviceContext &context, - const std::vector &dense_tensors_, + const std::vector &dense_tensors_, framework::Variable *p_dense_contents) { operators::math::ConcatFunctor concat_functor_; concat_functor_(context, @@ -89,10 +89,10 @@ template static void SplitTensorsForAllReduce( const DeviceContext &context, framework::Variable *p_dense_contents, - std::vector *p_dense_tensors) { + std::vector *p_dense_tensors) { auto *in = p_dense_contents->GetMutable(); - std::vector outs; - std::vector shape_refer; + std::vector outs; + std::vector shape_refer; outs.reserve(p_dense_tensors->size()); shape_refer.reserve(p_dense_tensors->size()); @@ -114,7 +114,7 @@ static void SplitTensorsForAllReduce( template static void ConcatTensorsWithType( const DeviceContext &context, - const std::vector &dense_tensors_, + const std::vector &dense_tensors_, framework::Variable *p_dense_contents, framework::proto::VarType::Type type) { switch (type) { @@ -140,11 +140,10 @@ static void ConcatTensorsWithType( // context is used to select the stream for split template -static void SplitTensorsWithType( - const DeviceContext &context, - framework::Variable *p_dense_contents, - std::vector *p_dense_tensors, - framework::proto::VarType::Type type) { +static void SplitTensorsWithType(const DeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors, + framework::proto::VarType::Type type) { switch (type) { case framework::proto::VarType::FP16: SplitTensorsForAllReduce( @@ -171,10 +170,10 @@ template <> void SplitTensorsForAllReduce( const platform::XPUDeviceContext &context, framework::Variable *p_dense_contents, - std::vector *p_dense_tensors) { + std::vector *p_dense_tensors) { auto *in = p_dense_contents->GetMutable(); - std::vector outs; - std::vector shape_refer; + std::vector outs; + std::vector shape_refer; outs.reserve(p_dense_tensors->size()); shape_refer.reserve(p_dense_tensors->size()); @@ -192,7 +191,7 @@ void SplitTensorsForAllReduce( template <> void ConcatTensorsWithType( const platform::XPUDeviceContext &context, - const std::vector &dense_tensors_, + const std::vector &dense_tensors_, framework::Variable *p_dense_contents, framework::proto::VarType::Type type) { switch (type) { @@ -213,7 +212,7 @@ template <> void SplitTensorsWithType( const platform::XPUDeviceContext &context, framework::Variable *p_dense_contents, - std::vector *p_dense_tensors, + std::vector *p_dense_tensors, framework::proto::VarType::Type type) { switch (type) { case framework::proto::VarType::FP32: @@ -234,7 +233,7 @@ void SplitTensorsWithType( template <> void ConcatTensorsWithType( const platform::MLUDeviceContext &context, - const std::vector &dense_tensors_, + const std::vector &dense_tensors_, framework::Variable *p_dense_contents, framework::proto::VarType::Type type) { switch (type) { @@ -259,7 +258,7 @@ template <> void SplitTensorsWithType( const platform::MLUDeviceContext &context, framework::Variable *p_dense_contents, - std::vector *p_dense_tensors, + std::vector *p_dense_tensors, framework::proto::VarType::Type type) { switch (type) { case framework::proto::VarType::FP16: @@ -479,7 +478,7 @@ void Reducer::InitializeDenseGroups( p_group->length_.push_back(size); // for concat operator - p_group->dense_tensors_.push_back(framework::Tensor()); + 
p_group->dense_tensors_.push_back(phi::DenseTensor()); // check the dtype and place, it must be same. const auto &dtype = var->DataType(); diff --git a/paddle/fluid/imperative/reducer.cu b/paddle/fluid/imperative/reducer.cu index a3f840f38bfad..59b7ecf915423 100644 --- a/paddle/fluid/imperative/reducer.cu +++ b/paddle/fluid/imperative/reducer.cu @@ -18,7 +18,7 @@ namespace paddle { namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -void Group::DivNRanks(framework::Tensor *tensor, +void Group::DivNRanks(phi::DenseTensor *tensor, int64_t nranks, const platform::DeviceContext &context) { #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index e6ac357565145..c455f962788b8 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -61,10 +61,10 @@ struct DivNRanksFunctor { template struct DivNRanksForAllReduce { - framework::Tensor* in_; + phi::DenseTensor* in_; int64_t nranks_; const platform::DeviceContext& ctx_; - DivNRanksForAllReduce(framework::Tensor* in, + DivNRanksForAllReduce(phi::DenseTensor* in, int64_t nranks, const platform::DeviceContext& ctx) : in_(in), nranks_(nranks), ctx_(ctx) {} @@ -89,7 +89,7 @@ class Group { bool is_sparse_ = false; // for concat kernel - std::vector dense_tensors_; + std::vector dense_tensors_; std::vector length_; @@ -111,7 +111,7 @@ class Group { void SplitTensors(const platform::DeviceContext& context); // use it in CUDA - void DivNRanks(framework::Tensor* tensor, + void DivNRanks(phi::DenseTensor* tensor, int64_t nranks, const platform::DeviceContext& context); diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 1f54ec8a132ba..9b417eefc4b32 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -278,7 +278,7 @@ static bool IsEqualVar(const framework::Variable& var1, return false; } - framework::Tensor t1, t2; + phi::DenseTensor t1, t2; if (var1.IsType()) { framework::TensorCopySync( @@ -328,7 +328,7 @@ static framework::Variable RandomTensor(const framework::DDim& dims, const platform::Place& place, int low = -10, int high = 10) { - framework::Tensor cpu_tensor; + phi::DenseTensor cpu_tensor; cpu_tensor.Resize(dims); auto* ptr = cpu_tensor.mutable_data(platform::CPUPlace()); std::uniform_int_distribution dist(low, high); diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc index 9df5e6a735bd0..570f72605b586 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -84,7 +84,7 @@ void GroupConcatSplit(Place place, size_t size) { place, data, cpu_place, value.data(), sizeof(T) * value.size()); } - framework::Tensor tmp; + phi::DenseTensor tmp; tmp.ShareDataWith(*tensor).Resize({static_cast(len)}); group.dense_tensors_.push_back(std::move(tmp)); group.all_length_ += len; @@ -103,7 +103,7 @@ void GroupConcatSplit(Place place, size_t size) { group.DivNRanks(*dev_ctx, 1); - framework::Tensor tmp; + phi::DenseTensor tmp; framework::TensorCopySync(*tensor, cpu_place, &tmp); auto* data = tmp.data(); size_t offset = 0; @@ -124,7 +124,7 @@ void GroupConcatSplit(Place place, size_t size) { for (size_t i = 0; i < size; ++i) { auto len = i + 1; auto& tensor = group.dense_tensors_[i]; - framework::Tensor tmp; + phi::DenseTensor tmp; framework::TensorCopySync(tensor, cpu_place, &tmp); 
auto* data = tmp.data(); diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index e35568eb50c9a..f9501dedfe240 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -114,7 +114,7 @@ TEST(test_prepare_op, test_prepare_op) { {})); } -const framework::Tensor* GetTensorFromVar(const framework::Variable& var); +const phi::DenseTensor* GetTensorFromVar(const framework::Variable& var); TEST(test_prepare_op, test_get_tensor_from_var) { std::shared_ptr vout_error( diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 98214f8d62bfe..eb4654e28e339 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -103,7 +103,7 @@ class VariableWrapper { bool IsEmpty() const { bool is_empty = true; if (var_.IsInitialized()) { - const framework::Tensor* tensor = nullptr; + const phi::DenseTensor* tensor = nullptr; if (var_.IsType()) { tensor = &(var_.Get()); } else if (var_.IsType()) { @@ -150,7 +150,7 @@ class VariableWrapper { } framework::proto::VarType::Type DataType() const { - const framework::Tensor* tensor = nullptr; + const phi::DenseTensor* tensor = nullptr; if (var_.IsInitialized()) { if (type_ == framework::proto::VarType::LOD_TENSOR) { tensor = &(var_.Get()); @@ -194,7 +194,7 @@ class VariableWrapper { } const platform::Place Place() const { - const framework::Tensor* tensor = nullptr; + const phi::DenseTensor* tensor = nullptr; auto place = platform::CPUPlace(); // Default place for var not initialized. if (var_.IsInitialized()) { diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index b49ad4c145d55..20dd3ad560921 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -160,11 +160,11 @@ void SaveMixedModel( for (const auto& param_name : parameters) { auto* var = scope->FindLocalVar(param_name); if (var->IsType() || - var->IsType()) { + var->IsType()) { auto* t = var->GetMutable(); if (t->dtype() != phi::DataType::FLOAT32) continue; - framework::Tensor mixed_tensor; + phi::DenseTensor mixed_tensor; mixed_tensor.Resize(t->dims()); auto* data = t->mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 7f63eeaad2bf1..7f01b3401728d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -61,7 +61,7 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { platform::errors::PreconditionNotMet("The var should not be nullptr")); if (var->IsType() || - var->IsType()) { + var->IsType()) { auto *t = var->GetMutable(); platform::CPUPlace cpu_place; @@ -126,7 +126,7 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { auto var_name = var_node->Var()->Name(); auto *var = scope->FindLocalVar(var_name); if (var->IsType() || - var->IsType()) { + var->IsType()) { auto *t = var->GetMutable(); params_total_bytes += t->numel() * experimental::SizeOf(t->dtype()); } @@ -135,7 +135,7 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { { // Alloc memory in pool 
to store all parameters. - framework::Tensor ts; + phi::DenseTensor ts; ts.mutable_data(place, params_total_bytes); } @@ -160,14 +160,14 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { platform::errors::PreconditionNotMet( "The var should not be nullptr")); if (var->IsType() || - var->IsType()) { + var->IsType()) { auto *t = var->GetMutable(); auto var_data_type = var_node->Var()->GetDataType(); VLOG(5) << "var_name is " << var_name << ", data type is " << var_data_type; if (var_data_type == paddle::framework::proto::VarType::FP16 && t->dtype() != paddle::experimental::DataType::FLOAT16) { - framework::Tensor half_tensor; + phi::DenseTensor half_tensor; half_tensor.set_type(paddle::experimental::DataType::FLOAT16); half_tensor.Resize(t->dims()); auto *half_data = @@ -179,7 +179,7 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { t->clear(); paddle::framework::TensorCopySync(half_tensor, place, t); } else if (var_data_type == paddle::framework::proto::VarType::BF16) { - framework::Tensor bf16_tensor; + phi::DenseTensor bf16_tensor; bf16_tensor.set_type(paddle::experimental::DataType::BFLOAT16); bf16_tensor.Resize(t->dims()); auto *bf16_data = bf16_tensor.mutable_data( diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index 6db0a204dbc3d..a7ce7f2205d73 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -39,7 +39,7 @@ namespace details { // training phase. struct TensorArrayBatchCleaner { TensorArrayBatchCleaner() { - constexpr auto kTensorId = framework::VarTypeTrait::kId; + constexpr auto kTensorId = framework::VarTypeTrait::kId; constexpr auto kLoDTensorId = framework::VarTypeTrait::kId; constexpr auto kSelectedRowsId = diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 022ba1483b955..6f496b86897e6 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -367,7 +367,7 @@ void Tensor::CopyToCpuImpl(T *data, auto *t_data = tensor->data(); auto t_place = tensor->place(); - paddle::framework::Tensor out; + phi::DenseTensor out; auto mem_allocation = std::make_shared( static_cast(data), @@ -843,7 +843,7 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, auto *t_data = tensor->data(); auto t_place = tensor->place(); - paddle::framework::Tensor out; + phi::DenseTensor out; auto mem_allocation = std::make_shared( static_cast(data), diff --git a/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc index 4d524c01b783f..d6aefd320678d 100644 --- a/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc @@ -36,7 +36,7 @@ class FillConstantOpConverter : public OpConverter { float value = PADDLE_GET_CONST(float, op_desc.GetAttr("value")); str_value = std::to_string(value); } - std::unique_ptr out_tensor(new framework::Tensor()); + std::unique_ptr out_tensor(new phi::DenseTensor()); out_tensor->Resize(phi::make_ddim(shape)); nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; void* trt_data = nullptr; diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 845e5c7d704ca..9d44c83d46243 100644 --- 
a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -59,7 +59,7 @@ class GeluOpConverter : public OpConverter { } std::string out_name = op_desc.Output("Out").front(); auto create_weights = [&](float data, std::string type) -> float* { - std::unique_ptr tmp_tensor(new framework::Tensor()); + std::unique_ptr tmp_tensor(new phi::DenseTensor()); tmp_tensor->Resize({1}); auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); tmp_data[0] = data; @@ -166,7 +166,7 @@ class GeluOpConverter : public OpConverter { } std::string out_name = op_desc.Output("Out").front(); auto create_weights = [&](float data, std::string type) -> float* { - std::unique_ptr tmp_tensor(new framework::Tensor()); + std::unique_ptr tmp_tensor(new phi::DenseTensor()); tmp_tensor->Resize({1}); auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); tmp_data[0] = data; diff --git a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc index 6752bf1d49768..03ec113311175 100644 --- a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc @@ -141,8 +141,7 @@ class MatMulOpConverter : public OpConverter { auto create_weights = [&](float data, const std::string& type) -> float* { - std::unique_ptr tmp_tensor( - new framework::Tensor()); + std::unique_ptr tmp_tensor(new phi::DenseTensor()); tmp_tensor->Resize({1}); auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 85a9b9d2fb346..da500014b8e81 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -495,7 +495,7 @@ class OpConverter { int data_size = std::accumulate( shape.d, shape.d + shape.nbDims, 1, std::multiplies()); - std::unique_ptr tmp_tensor(new framework::Tensor()); + std::unique_ptr tmp_tensor(new phi::DenseTensor()); tmp_tensor->Resize({data_size}); auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); for (int i = 0; i < data_size; i++) { @@ -530,7 +530,7 @@ class OpConverter { "supports float, half or int32_t.")); } - std::unique_ptr tmp_tensor(new framework::Tensor()); + std::unique_ptr tmp_tensor(new phi::DenseTensor()); int data_size = data.size(); tmp_tensor->Resize({data_size}); auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index 9b0798d9f354f..a3b2e65ac4976 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -50,7 +50,7 @@ class ScaleOpConverter : public OpConverter { float bias = PADDLE_GET_CONST(float, op_desc.GetAttr("bias")); float scale = PADDLE_GET_CONST(float, op_desc.GetAttr("scale")); auto create_weights = [&](float data, std::string type) -> float* { - std::unique_ptr tmp_tensor(new framework::Tensor()); + std::unique_ptr tmp_tensor(new phi::DenseTensor()); tmp_tensor->Resize({1}); auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); tmp_data[0] = data; diff --git a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc index cb67957c79cbf..2302d96e23564 100644 --- a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc @@ -107,7 +107,7 @@ class StridedSliceOpConverter : public OpConverter { auto create_weights = [&](const std::vector& data, const std::string& type) -> int* { - std::unique_ptr tmp_tensor(new framework::Tensor()); + std::unique_ptr tmp_tensor(new phi::DenseTensor()); int data_size = data.size(); tmp_tensor->Resize({data_size}); auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 0cf1d6352c340..cf53e51d62fa7 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -423,7 +423,7 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { // Note: Only for support plugin. TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight( - const std::string &name, const framework::Tensor &weight_tensor) { + const std::string &name, const phi::DenseTensor &weight_tensor) { static int name_suffix_counter = 0; std::string name_suffix = std::to_string(name_suffix_counter); std::string splitter = "__"; @@ -435,7 +435,7 @@ TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight( "The weight named %s is set into the weight map " "twice in TRT OP converter.", name_with_suffix)); - weight_map[name_with_suffix].reset(new framework::Tensor()); + weight_map[name_with_suffix].reset(new phi::DenseTensor()); weight_map[name_with_suffix]->Resize(weight_tensor.dims()); TensorRTEngine::Weight weight; @@ -445,7 +445,7 @@ TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight( // if trt not support dtype, we need to cast to fp16. if (weight_tensor.dtype() == phi::DataType::BFLOAT16) { - framework::Tensor bf16_tensor; + phi::DenseTensor bf16_tensor; bf16_tensor.clear(); paddle::framework::TensorCopySync( weight_tensor, platform::CPUPlace(), &bf16_tensor); @@ -459,7 +459,7 @@ TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight( fp16_data[i] = static_cast(bf16_data[i]); } } else if (weight_tensor.dtype() == phi::DataType::FLOAT32) { - framework::Tensor fp32_tensor; + phi::DenseTensor fp32_tensor; fp32_tensor.clear(); paddle::framework::TensorCopySync( weight_tensor, platform::CPUPlace(), &fp32_tensor); @@ -483,7 +483,7 @@ TensorRTEngine::Weight TensorRTEngine::GetFp16TrtWeight( // Note: Only for support plugin. TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight( - const std::string &name, const framework::Tensor &weight_tensor) { + const std::string &name, const phi::DenseTensor &weight_tensor) { static int name_suffix_counter = 0; std::string name_suffix = std::to_string(name_suffix_counter); std::string splitter = "__"; @@ -495,7 +495,7 @@ TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight( "The weight named %s is set into the weight map " "twice in TRT OP converter.", name_with_suffix)); - weight_map[name_with_suffix].reset(new framework::Tensor()); + weight_map[name_with_suffix].reset(new phi::DenseTensor()); weight_map[name_with_suffix]->Resize(weight_tensor.dims()); TensorRTEngine::Weight weight; @@ -505,7 +505,7 @@ TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight( // if trt not support dtype, we need to cast to fp32. 
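The engine.cc weight helpers touched above and below (GetFp16TrtWeight, GetFp32TrtWeight, GetTrtWeight) all follow one pattern: when a weight arrives in a dtype TensorRT cannot consume (bfloat16, int64, ...), it is first copied to the CPU and then converted element-wise into a supported dtype before being registered. What follows is a minimal standalone sketch of that element-wise cast only, not part of the patch; CastWeightBuffer and the double stand-in dtype are illustrative names, and the real code stages the copy through a temporary phi::DenseTensor with paddle::framework::TensorCopySync before writing into the engine's weight map.

#include <cstddef>
#include <vector>

// Sketch of the element-wise cast used when a weight dtype is not supported
// by TensorRT: copy the data aside, then convert each element to a supported
// type. "CastWeightBuffer" is a hypothetical helper for illustration only.
template <typename DstT, typename SrcT>
std::vector<DstT> CastWeightBuffer(const SrcT* src, std::size_t n) {
  std::vector<DstT> dst(n);
  for (std::size_t i = 0; i < n; ++i) {
    dst[i] = static_cast<DstT>(src[i]);
  }
  return dst;
}

int main() {
  // A double buffer stands in for an unsupported weight dtype; TensorRT-side
  // code would consume the resulting float (fp32) buffer.
  const double raw_weight[] = {0.5, -1.25, 3.0};
  std::vector<float> fp32 = CastWeightBuffer<float>(raw_weight, 3);
  return fp32.size() == 3 ? 0 : 1;
}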
if (weight_tensor.dtype() == phi::DataType::BFLOAT16) { - framework::Tensor bf16_tensor; + phi::DenseTensor bf16_tensor; bf16_tensor.clear(); paddle::framework::TensorCopySync( weight_tensor, platform::CPUPlace(), &bf16_tensor); @@ -519,7 +519,7 @@ TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight( fp32_data[i] = static_cast(bf16_data[i]); } } else if (weight_tensor.dtype() == phi::DataType::FLOAT16) { - framework::Tensor fp16_tensor; + phi::DenseTensor fp16_tensor; fp16_tensor.clear(); paddle::framework::TensorCopySync( weight_tensor, platform::CPUPlace(), &fp16_tensor); @@ -542,7 +542,7 @@ TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight( } TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( - const std::string &name, const framework::Tensor &weight_tensor) { + const std::string &name, const phi::DenseTensor &weight_tensor) { static int name_suffix_counter = 0; std::string name_suffix = std::to_string(name_suffix_counter); std::string splitter = "__"; @@ -555,7 +555,7 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( "twice in TRT OP converter.", name_with_suffix)); - weight_map[name_with_suffix].reset(new framework::Tensor()); + weight_map[name_with_suffix].reset(new phi::DenseTensor()); weight_map[name_with_suffix]->Resize(weight_tensor.dims()); TensorRTEngine::Weight weight; @@ -563,7 +563,7 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( // if trt not support dtype, we need to cast to fp32. if (weight_tensor.dtype() == phi::DataType::BFLOAT16) { - framework::Tensor bf16_tensor; + phi::DenseTensor bf16_tensor; bf16_tensor.clear(); paddle::framework::TensorCopySync( weight_tensor, platform::CPUPlace(), &bf16_tensor); @@ -578,7 +578,7 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( weight.SetDataType(phi::DataType::FLOAT32); weight.SetValues(fp32_data); } else if (weight_tensor.dtype() == phi::DataType::INT64) { - framework::Tensor int64_tensor; + phi::DenseTensor int64_tensor; int64_tensor.clear(); paddle::framework::TensorCopySync( weight_tensor, platform::CPUPlace(), &int64_tensor); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 209f297a0668f..2f742dbdb50a6 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -451,15 +451,15 @@ class TensorRTEngine { // Get fp16 trt weight. If src weight is not fp16, we will cast. Weight GetFp16TrtWeight(const std::string& name, - const framework::Tensor& weight_tensor); + const phi::DenseTensor& weight_tensor); // Get fp32 trt weight. If src weight is not fp32, we will cast. Weight GetFp32TrtWeight(const std::string& name, - const framework::Tensor& weight_tensor); + const phi::DenseTensor& weight_tensor); // if the src weight type is fp16, then return fp16 trt weight, etc. Weight GetTrtWeight(const std::string& name, - const framework::Tensor& weight_tensor); + const phi::DenseTensor& weight_tensor); float GetTensorDynamicRange(nvinfer1::ITensor* tensor) { return quant_dynamic_range_[tensor]; @@ -474,13 +474,13 @@ class TensorRTEngine { // so we need to copy the weights from GPU to CPU in our op converter. // We use a map to store these weights for the weight memory is not released // in advance, which affecting the construction of TRT Op. - std::unordered_map> + std::unordered_map> weight_map; // When setting weight_map, a self-increasing suffix is needed for the names // so as to avoid repeatedly setting weights with the same name. 
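The comment that closes the engine.h hunk above explains why weight names get a self-increasing suffix: each converted weight must stay alive until the engine is built, and two registrations of the same parameter name must not overwrite each other in the map. Below is a minimal standalone sketch of that ownership-plus-naming idea, not part of the patch; WeightStore and the empty DenseTensor struct are stand-ins for illustration, not the actual TensorRTEngine members.

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>

struct DenseTensor {};  // stand-in for phi::DenseTensor in this sketch

// Keeps converted weights alive for the whole engine build and appends a
// self-increasing "__<counter>" suffix so repeated registrations of the same
// name never collide -- the same idea as the engine's weight_map / SetWeights
// in the surrounding hunks.
class WeightStore {
 public:
  DenseTensor* Add(const std::string& name, std::unique_ptr<DenseTensor> t) {
    const std::string key = name + "__" + std::to_string(counter_++);
    DenseTensor* raw = t.get();
    weights_.emplace(key, std::move(t));
    return raw;  // valid as long as the store (engine) is alive
  }

 private:
  int counter_ = 0;
  std::unordered_map<std::string, std::unique_ptr<DenseTensor>> weights_;
};

int main() {
  WeightStore store;
  auto* w0 = store.Add("fc_w", std::make_unique<DenseTensor>());
  auto* w1 = store.Add("fc_w", std::make_unique<DenseTensor>());  // no clash
  return (w0 != nullptr && w1 != nullptr && w0 != w1) ? 0 : 1;
}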
void SetWeights(std::string w_name, - std::unique_ptr w_tensor) { + std::unique_ptr w_tensor) { static int suffix_counter = 0; std::string suffix = std::to_string(suffix_counter); std::string splitter = "__"; diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index ec7d9545ce387..d0815798a6e47 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -95,7 +95,7 @@ class EmbEltwiseLayernormPluginDynamicImpl int hidden_size_; float eps_; - framework::Tensor in_ptr_tensor_, emb_ptr_tensor_; + phi::DenseTensor in_ptr_tensor_, emb_ptr_tensor_; int device_id_{0}; bool is_initialized_{false}; }; @@ -303,10 +303,11 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { SerializeValue(&buffer, eps_); } - nvinfer1::DimsExprs getOutputDimensions(int output_index, - const nvinfer1::DimsExprs* inputs, - int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // NOLINT TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu index f52ef0c52ff0e..e49bf16bf6878 100644 --- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -469,7 +469,7 @@ inline void enqueueImpl(const nvinfer1::PluginTensorDesc* input_desc, sizeof(T) * 8, stream)); int64_t temp_size = temp_storage_bytes; - framework::Tensor temp_storage; + phi::DenseTensor temp_storage; auto* temp_storage_data = temp_storage.mutable_data( {temp_size}, platform::CUDAPlace(device_id)); diff --git a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu index 294677e6ac5de..ca4126d5aefcf 100644 --- a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu @@ -87,7 +87,7 @@ int GroupNormPlugin::enqueue(int batch_size, float *variance_d = variance_t.mutable_data(platform::CUDAPlace(device_id)); - framework::Tensor temp_variance_t; + phi::DenseTensor temp_variance_t; temp_variance_t.Resize(phi::make_ddim(variance_shape_)); float *temp_variance_d = temp_variance_t.mutable_data(platform::CUDAPlace(device_id)); @@ -220,7 +220,7 @@ int GroupNormPluginDynamic::enqueue( float *variance_d = variance_t.mutable_data(platform::CUDAPlace(device_id)); - framework::Tensor temp_variance_t; + phi::DenseTensor temp_variance_t; temp_variance_t.Resize(phi::make_ddim(batched_variance_shape)); float *temp_variance_d = temp_variance_t.mutable_data(platform::CUDAPlace(device_id)); diff --git a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h index fdcb93e29f042..757ff33443455 100644 --- a/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h @@ -105,10 +105,10 @@ class GroupNormPlugin : public PluginTensorRT { private: std::vector scale_; std::vector bias_; - framework::Tensor scale_t; - framework::Tensor bias_t; - framework::Tensor mean_t; - 
framework::Tensor variance_t; + phi::DenseTensor scale_t; + phi::DenseTensor bias_t; + phi::DenseTensor mean_t; + phi::DenseTensor variance_t; int groups_; float eps_; std::vector mean_shape_; @@ -187,10 +187,11 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { SerializeValue(&buffer, mean_shape_); SerializeValue(&buffer, variance_shape_); } - nvinfer1::DimsExprs getOutputDimensions(int output_index, - const nvinfer1::DimsExprs* inputs, - int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // NOLINT TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, @@ -226,10 +227,10 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { private: std::vector scale_; std::vector bias_; - framework::Tensor scale_t; - framework::Tensor bias_t; - framework::Tensor mean_t; - framework::Tensor variance_t; + phi::DenseTensor scale_t; + phi::DenseTensor bias_t; + phi::DenseTensor mean_t; + phi::DenseTensor variance_t; int groups_; float eps_; std::vector mean_shape_; diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h index 30ee5dbea8fc8..90a01d076f367 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h @@ -33,8 +33,8 @@ class InstanceNormPlugin : public PluginTensorRT { std::vector scale_; std::vector bias_; - framework::Tensor scale_t; - framework::Tensor bias_t; + phi::DenseTensor scale_t; + phi::DenseTensor bias_t; cudnnHandle_t handle_; cudnnTensorDescriptor_t x_desc_, y_desc_, b_desc_; diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h index a8ccabb3cff59..84d1898d50f84 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h @@ -31,10 +31,10 @@ namespace plugin { class LayerNormPlugin : public PluginTensorRT { std::vector bias_; std::vector scale_; - framework::Tensor scale_t; - framework::Tensor bias_t; - framework::Tensor mean_t; - framework::Tensor variance_t; + phi::DenseTensor scale_t; + phi::DenseTensor bias_t; + phi::DenseTensor mean_t; + phi::DenseTensor variance_t; int begin_norm_axis_; float eps_; std::vector mean_shape_; @@ -201,10 +201,11 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT { SerializeValue(&buffer, variance_shape_); } - nvinfer1::DimsExprs getOutputDimensions(int output_index, - const nvinfer1::DimsExprs* inputs, - int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // NOLINT TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, @@ -240,10 +241,10 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT { private: std::vector bias_; std::vector scale_; - framework::Tensor scale_t; - framework::Tensor bias_t; - framework::Tensor mean_t; - framework::Tensor variance_t; + phi::DenseTensor scale_t; + phi::DenseTensor bias_t; + phi::DenseTensor mean_t; + phi::DenseTensor variance_t; int begin_norm_axis_; float eps_; std::vector mean_shape_; diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu 
b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index e4a9504d8c869..336fcb5531799 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -301,7 +301,7 @@ void QkvToContextPluginDynamic::configurePlugin( } else if (in[0].desc.type == nvinfer1::DataType::kFLOAT) { fake_qk_bias_ = reinterpret_cast( tensor_.mutable_data(platform::CUDAPlace(device_id))); - long size = sizeof(int32_t) * batch * seq_len * seq_len * head_number_; + int64_t size = sizeof(int32_t) * batch * seq_len * seq_len * head_number_; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); @@ -408,7 +408,7 @@ int QkvToContextPluginDynamic::enqueue( // input[0], (B, S, 3 * N * H, 1, 1) int batch = input_dims.d[0]; int seq_len = input_dims.d[1]; - framework::Tensor multihead_temp_tensor; + phi::DenseTensor multihead_temp_tensor; int scratch_size = batch * head_number_ * seq_len * seq_len * 1; int device_id; @@ -425,7 +425,7 @@ int QkvToContextPluginDynamic::enqueue( const float *input0_data = static_cast(inputs[0]); // fit to [batch, head_num, length, length] + [batch, 1, 1, length] - framework::Tensor temp_qk_bias_tensor; + phi::DenseTensor temp_qk_bias_tensor; float *qk_bias = const_cast(static_cast(inputs[1])); if (ProductDim(input_desc[1].dims) == (batch * seq_len)) { temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len}); @@ -494,7 +494,7 @@ int QkvToContextPluginDynamic::enqueue( const half *input0_data = static_cast(inputs[0]); // fit to [batch, head_num, length, length] + [batch, 1, 1, length] - framework::Tensor temp_qk_bias_tensor; + phi::DenseTensor temp_qk_bias_tensor; half *qk_bias = const_cast(static_cast(inputs[1])); if (ProductDim(input_desc[1].dims) == (batch * seq_len)) { temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len}); diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index 17c9e904d4228..dd3dc71e956a4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -83,10 +83,11 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { SerializeValue(&buffer, with_fp16_); } - nvinfer1::DimsExprs getOutputDimensions(int output_index, - const nvinfer1::DimsExprs* inputs, - int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // NOLINT TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, @@ -124,7 +125,7 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { int head_number_; int head_size_; float scale_; - framework::Tensor tensor_; + phi::DenseTensor tensor_; half* mask_half_; float* fake_qk_bias_; }; diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index 43e219232d111..6a253d2815941 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -91,8 +91,8 @@ class TensorRTDynamicEngineTest : public ::testing::Test { } protected: - framework::Tensor input_; - framework::Tensor output_; + phi::DenseTensor input_; + phi::DenseTensor output_; TensorRTEngine *engine_; phi::GPUContext *ctx_; }; @@ -276,8 +276,8 @@ 
class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { } protected: - std::vector inputs_; - std::vector outputs_; + std::vector inputs_; + std::vector outputs_; TensorRTEngine *engine_; phi::GPUContext *ctx_; }; diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index dc8065ab2a628..027c593d73c6f 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -66,8 +66,8 @@ class TensorRTEngineTest : public ::testing::Test { } protected: - framework::Tensor input_; - framework::Tensor output_; + phi::DenseTensor input_; + phi::DenseTensor output_; TensorRTEngine *engine_; phi::GPUContext *ctx_; }; diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index 948a3e105f3d4..dbcdc8b8b7c1c 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -33,7 +33,7 @@ TRTInt8Calibrator::TRTInt8Calibrator( int i = 0; VLOG(4) << "Init a new calibrator: " << engine_name_; for (const auto it : buffers) { - framework::Tensor temp_tensor; + phi::DenseTensor temp_tensor; std::string input_name = it.first; int data_size = it.second; int num_ele = data_size / sizeof(int16_t); diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h index 35b018c1a0262..d0d4c46b4dc6b 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -73,7 +73,7 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { std::condition_variable cond_; std::unordered_map> data_buffers_; - std::vector data_tensors_; + std::vector data_tensors_; std::string engine_name_; std::string calibration_table_; diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 9a2a75a642ab7..d8fd433c0417c 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -153,7 +153,7 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return framework::OpKernelType( framework::TransToProtoVarType(tensor.dtype()), diff --git a/paddle/fluid/operators/abs_op_mlu.cc b/paddle/fluid/operators/abs_op_mlu.cc index 284ef4902be4b..9afa4c28e0544 100644 --- a/paddle/fluid/operators/abs_op_mlu.cc +++ b/paddle/fluid/operators/abs_op_mlu.cc @@ -18,14 +18,14 @@ limitations under the Licnse. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class AbsMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); @@ -44,9 +44,9 @@ template class AbsGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/abs_op_npu.cc b/paddle/fluid/operators/abs_op_npu.cc index 490350ab980cd..a1ca88ae5b572 100644 --- a/paddle/fluid/operators/abs_op_npu.cc +++ b/paddle/fluid/operators/abs_op_npu.cc @@ -18,14 +18,14 @@ limitations under the Licnse. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class AbsNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -47,9 +47,9 @@ template class AbsGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/activation_cudnn.cu.cc b/paddle/fluid/operators/activation_cudnn.cu.cc index 2ad92e36272b3..3afe6b4608fc4 100644 --- a/paddle/fluid/operators/activation_cudnn.cu.cc +++ b/paddle/fluid/operators/activation_cudnn.cu.cc @@ -18,7 +18,7 @@ namespace paddle { namespace operators { -using framework::Tensor; + using platform::ActivationDescriptor; using platform::TensorDescriptor; @@ -27,7 +27,7 @@ class CudnnActivationKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - framework::Tensor *X, *Out; + phi::DenseTensor *X, *Out; ExtractActivationTensor(context, X, Out); ActivationDescriptor act_desc; TensorDescriptor x_desc, out_desc; diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 49f78715c2cf5..c4e2685dd5958 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -18,7 +18,7 @@ namespace paddle { namespace operators { -using framework::Tensor; + using phi::GPUContext; using platform::ActivationDescriptor; using platform::TensorDescriptor; @@ -49,7 +49,7 @@ struct CudnnActivationFunctor { const cudnnActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} #endif - void operator()(const Tensor& x, Tensor* out) { + void operator()(const phi::DenseTensor& x, phi::DenseTensor* out) { ActivationDescriptor act_desc; act_desc.set(mode_, coef_); TensorDescriptor x_desc, 
out_desc; @@ -100,10 +100,10 @@ struct CudnnActivationGradFunctor { const cudnnActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} #endif - void operator()(const Tensor& x, - const Tensor& out, - const Tensor dout, - Tensor* dx) { + void operator()(const phi::DenseTensor& x, + const phi::DenseTensor& out, + const phi::DenseTensor dout, + phi::DenseTensor* dx) { ActivationDescriptor act_desc; act_desc.set(mode_, coef_); TensorDescriptor x_desc, out_desc, dout_desc, dx_desc; @@ -217,8 +217,8 @@ class CudnnActivationKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* X = nullptr; - framework::Tensor* Out = nullptr; + const phi::DenseTensor* X = nullptr; + phi::DenseTensor* Out = nullptr; ExtractActivationTensor(context, &X, &Out); Out->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); @@ -236,9 +236,9 @@ class CudnnActivationGradKernel static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut, "Forward deps must be Out."); - const framework::Tensor *X, *Out, *dOut; + const phi::DenseTensor *X, *Out, *dOut; X = Out = dOut = nullptr; - framework::Tensor* dX = nullptr; + phi::DenseTensor* dX = nullptr; ExtractActivationGradTensor( context, &X, &Out, &dOut, &dX); dX->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index d0ac97b2d002b..f921ad844b310 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -30,8 +30,6 @@ DECLARE_bool(use_mkldnn); namespace paddle { namespace operators { -using paddle::framework::Tensor; - template static constexpr bool CanInplaceAct() { return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut || @@ -124,7 +122,7 @@ class ActivationOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN // When activation is first oneDNN op (there was some non oneDNN op @@ -1345,7 +1343,7 @@ class PowOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "FactorTensor") { return expected_kernel_type; @@ -1373,7 +1371,7 @@ class PowOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "FactorTensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index eeec8c300daac..9421240c14c67 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -52,8 +52,8 @@ static std::unordered_set CanBeUsedBySelectedRows = { "abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"}; inline void ExtractActivationTensor(const framework::ExecutionContext& context, - const framework::Tensor** X, - framework::Tensor** Out) { + const phi::DenseTensor** X, + phi::DenseTensor** Out) { auto x_var = context.InputVar("X"); auto out_var = context.OutputVar("Out"); PADDLE_ENFORCE_NOT_NULL(x_var, @@ 
-70,8 +70,8 @@ inline void ExtractActivationTensor(const framework::ExecutionContext& context, *Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( out_var); } else { - *X = context.Input("X"); - *Out = context.Output("Out"); + *X = context.Input("X"); + *Out = context.Output("Out"); } PADDLE_ENFORCE_NOT_NULL( @@ -84,10 +84,10 @@ inline void ExtractActivationTensor(const framework::ExecutionContext& context, template inline void ExtractActivationGradTensor( const framework::ExecutionContext& context, - const framework::Tensor** X, - const framework::Tensor** Out, - const framework::Tensor** dOut, - framework::Tensor** dX) { + const phi::DenseTensor** X, + const phi::DenseTensor** Out, + const phi::DenseTensor** dOut, + phi::DenseTensor** dX) { auto out_grad_var = context.InputVar(framework::GradVarName("Out")); auto x_grad_var = context.OutputVar(framework::GradVarName("X")); const framework::Variable* out_var = nullptr; @@ -129,9 +129,9 @@ inline void ExtractActivationGradTensor( } } else { - *Out = context.Input("Out"); - *dOut = context.Input(framework::GradVarName("Out")); - *dX = context.Output(framework::GradVarName("X")); + *Out = context.Input("Out"); + *dOut = context.Input(framework::GradVarName("Out")); + *dX = context.Output(framework::GradVarName("X")); if (out_var) { *Out = &(out_var->Get()); @@ -156,7 +156,7 @@ inline void ExtractActivationGradTensor( if (CanBeUsedBySelectedRows.count(context.Type())) { *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); } else { - *X = context.Input("X"); + *X = context.Input("X"); } } else { VLOG(10) << " Inplace activation of Op : " << context.Type(); @@ -171,8 +171,8 @@ class ActivationKernel using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* X = nullptr; - framework::Tensor* Out = nullptr; + const phi::DenseTensor* X = nullptr; + phi::DenseTensor* Out = nullptr; ExtractActivationTensor(context, &X, &Out); Out->mutable_data(context.GetPlace()); @@ -205,8 +205,8 @@ class ActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor *X, *Out, *dOut; - framework::Tensor* dX = nullptr; + const phi::DenseTensor *X, *Out, *dOut; + phi::DenseTensor* dX = nullptr; X = Out = dOut = nullptr; ExtractActivationGradTensor( context, &X, &Out, &dOut, &dX); @@ -391,11 +391,10 @@ template class ELUGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); + auto* X = context.Input("X"); + auto* Out = context.Input("Out"); + auto* dOut = context.Input(framework::GradVarName("Out")); + auto* dX = context.Output(framework::GradVarName("X")); const float alpha = context.Attr("alpha"); dX->mutable_data(context.GetPlace()); @@ -426,12 +425,12 @@ template struct AbsGradGradFunctor : public BaseActivationFunctor { template void operator()(const Device& dev, - const framework::Tensor* X, - const framework::Tensor* Out, - const framework::Tensor* ddX, - framework::Tensor* ddOut, - framework::Tensor* dOut, - framework::Tensor* dX) const { + const phi::DenseTensor* X, + const phi::DenseTensor* Out, + const phi::DenseTensor* ddX, + phi::DenseTensor* ddOut, + phi::DenseTensor* dOut, 
+ phi::DenseTensor* dX) const { auto* d = dev.eigen_device(); auto ddx = framework::EigenVector::Flatten( GET_DATA_SAFELY(ddX, "Input", "DDX", "AbsGradGrad")); @@ -451,11 +450,11 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { // others. Impliment extraction kernel separately here. inline void ExtractDoubleGradTensorWithInputDOut( const framework::ExecutionContext& ctx, - const framework::Tensor** X, - const framework::Tensor** ddX, - framework::Tensor** dX, - const framework::Tensor** dOut, - framework::Tensor** ddOut) { + const phi::DenseTensor** X, + const phi::DenseTensor** ddX, + phi::DenseTensor** dX, + const phi::DenseTensor** dOut, + phi::DenseTensor** ddOut) { // extract ddX(output), ddOut(input) auto ddx_var = ctx.InputVar("DDX"); auto ddo_var = ctx.OutputVar("DDOut"); @@ -464,9 +463,9 @@ inline void ExtractDoubleGradTensorWithInputDOut( platform::errors::NotFound( "Cannot get input Variable Out, variable name = %s", ctx.InputName("DDX"))); - *ddX = ctx.Input("DDX"); + *ddX = ctx.Input("DDX"); if (ddo_var) { - *ddOut = ctx.Output("DDOut"); + *ddOut = ctx.Output("DDOut"); } PADDLE_ENFORCE_NOT_NULL( ddX, @@ -482,15 +481,15 @@ inline void ExtractDoubleGradTensorWithInputDOut( "Cannot get input Variable Out, variable name = %s", ctx.InputName("X"))); auto dx_var = ctx.OutputVar("DX"); - *X = ctx.Input("X"); + *X = ctx.Input("X"); if (dx_var) { - *dX = ctx.Output("DX"); + *dX = ctx.Output("DX"); } // extract dOut(input) auto dout_var = ctx.InputVar("DOut"); if (dout_var) { - *dOut = ctx.Input("DOut"); + *dOut = ctx.Input("DOut"); } } diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 76a05aa37a646..8cc5e925f7490 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -72,13 +72,13 @@ class ActivationCudaKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor* x = nullptr; - framework::Tensor* out = nullptr; + const phi::DenseTensor* x = nullptr; + phi::DenseTensor* out = nullptr; ExtractActivationTensor(ctx, &x, &out); out->mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); - std::vector ins = {x}; - std::vector outs = {out}; + std::vector ins = {x}; + std::vector outs = {out}; auto functor = Functor(); auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { @@ -95,8 +95,8 @@ class ActivationGradCudaKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *x, *out, *d_out; - framework::Tensor* d_x = nullptr; + const phi::DenseTensor *x, *out, *d_out; + phi::DenseTensor* d_x = nullptr; x = out = d_out = nullptr; ExtractActivationGradTensor( ctx, &x, &out, &d_out, &d_x); @@ -108,8 +108,8 @@ class ActivationGradCudaKernel *attr.second = ctx.Attr(attr.first); } - std::vector ins = {d_out}; - std::vector outs = {d_x}; + std::vector ins = {d_out}; + std::vector outs = {d_x}; if (static_cast(Functor::FwdDeps()) == static_cast(ActBwdOpFwdDeps::kDepOut)) { diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index 6cfe4738d777b..736b398996b45 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -21,14 +21,14 @@ limitations under the Licnse. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ActivationMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; output->mutable_data(ctx.GetPlace()); @@ -51,9 +51,9 @@ template class ActivationGradMLUKernelV1 : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; dx->mutable_data(ctx.GetPlace()); @@ -82,9 +82,9 @@ template class ActivationGradMLUKernelV2 : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; dx->mutable_data(ctx.GetPlace()); @@ -113,9 +113,9 @@ template class ActivationGradMLUKernelV3 : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); float alpha = ctx.HasAttr("alpha") ? 
ctx.Attr("alpha") : 1.0f; dx->mutable_data(ctx.GetPlace()); @@ -144,8 +144,8 @@ template class SqrtMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); @@ -167,9 +167,9 @@ template class SqrtGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); dx->mutable_data(place); @@ -190,8 +190,8 @@ template class LogMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc input_desc(*input); @@ -212,8 +212,8 @@ template class ExpMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc input_desc(*input); @@ -233,9 +233,9 @@ template class ExpGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc dout_desc(*dout); MLUCnnlTensorDesc dx_desc(*dx); @@ -260,8 +260,8 @@ template class HardSwishMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); float threshold = ctx.Attr("threshold"); float scale = ctx.Attr("scale"); @@ -312,9 +312,9 @@ class HardSwishGradMLUKernel : public framework::OpKernel { offset, 3.0f, platform::errors::External("Not support offset [%f] in MLU", offset)); - auto* out = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); @@ -342,8 +342,8 @@ template class HardSigmoidMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); float slope = ctx.Attr("slope"); float offset = ctx.Attr("offset"); output->mutable_data(ctx.GetPlace()); @@ -369,9 +369,9 @@ template class HardSigmoidGradMLUKernel : public 
framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); float slope = ctx.Attr("slope"); float offset = ctx.Attr("offset"); dx->mutable_data(ctx.GetPlace()); @@ -403,8 +403,8 @@ template class FloorMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc input_desc(*input); @@ -422,8 +422,8 @@ template class ReciprocalMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); MLUCnnlTensorDesc x_desc(*x); @@ -437,9 +437,9 @@ template class ReciprocalGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); dx->mutable_data(place); Tensor square_out; diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 141e5832cceb8..52a472a595a92 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -24,7 +24,7 @@ limitations under the Licnse. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class PowNPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index 833285615f169..a2dbd8e2aa269 100644 --- a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -31,7 +31,6 @@ namespace operators { constexpr int kMULMKLDNNINT8 = 1; using framework::OpKernelType; -using framework::Tensor; class AddMMOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc index 8c6360bfd89cf..23bccf9d8319c 100644 --- a/paddle/fluid/operators/affine_channel_op.cc +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -188,11 +188,11 @@ template class AffineChannelKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Out"); + auto* y = ctx.Output("Out"); y->mutable_data(ctx.GetPlace()); const framework::DataLayout layout = @@ -233,14 +233,14 @@ template class AffineChannelGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* scale = ctx.Input("Scale"); - auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto* dscale = - ctx.Output(framework::GradVarName("Scale")); - auto* dbias = ctx.Output(framework::GradVarName("Bias")); + ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); const framework::DataLayout layout = framework::StringToDataLayout(ctx.Attr("data_layout")); diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 8fcdb32388418..cbbbd96ad845e 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -52,11 +52,11 @@ template class AffineChannelCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Out"); + auto* y = ctx.Output("Out"); y->mutable_data(ctx.GetPlace()); const framework::DataLayout layout = @@ -137,15 +137,15 @@ template class AffineChannelGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto* dscale = - ctx.Output(framework::GradVarName("Scale")); - auto* 
dbias = ctx.Output(framework::GradVarName("Bias")); + ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); const framework::DataLayout layout = framework::StringToDataLayout(ctx.Attr("data_layout")); diff --git a/paddle/fluid/operators/affine_channel_op_xpu.cc b/paddle/fluid/operators/affine_channel_op_xpu.cc index f31ad6378912c..b3b64cb0b0684 100644 --- a/paddle/fluid/operators/affine_channel_op_xpu.cc +++ b/paddle/fluid/operators/affine_channel_op_xpu.cc @@ -29,11 +29,11 @@ template class AffineChannelXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Out"); + auto* y = ctx.Output("Out"); y->mutable_data(ctx.GetPlace()); const framework::DataLayout layout = @@ -90,14 +90,14 @@ template class AffineChannelGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* scale = ctx.Input("Scale"); - auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto* dscale = - ctx.Output(framework::GradVarName("Scale")); - auto* dbias = ctx.Output(framework::GradVarName("Bias")); + ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); const framework::DataLayout layout = framework::StringToDataLayout(ctx.Attr("data_layout")); diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 1c0b8800f7bf5..2411860aa9e74 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -28,7 +28,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class AffineGridOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc index 78bacc3016178..508c51de723c0 100644 --- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -21,13 +21,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class AllocFloatStatusKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* float_status = ctx.Output("FloatStatus"); + auto* float_status = ctx.Output("FloatStatus"); float_status->mutable_data(ctx.GetPlace()); const auto& runner = diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc index 41ba11ac04609..5f5415ffd37d0 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { @@ -28,10 +28,10 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { auto& dev_ctx = ctx.template device_context(); - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); found_inf->mutable_data(dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 98768afa9362a..3b6e2ba7184c0 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; // NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. // On NPU, we do not really check the data of input tensors, @@ -34,11 +34,11 @@ template class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - const auto* float_status = ctx.Input("FloatStatus"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + const auto* float_status = ctx.Input("FloatStatus"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); found_inf->mutable_data(ctx.GetPlace()); @@ -52,7 +52,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); // Inverse(1.0/scale) - Tensor* tmp_inverse_out = const_cast(scale); + phi::DenseTensor* tmp_inverse_out = const_cast(scale); Tensor inverse_out(scale->type()); inverse_out.Resize(scale->dims()); inverse_out.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index cc60476c2690b..02dadf385e102 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -31,7 +31,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -using Tensor = paddle::framework::Tensor; +using Tensor = phi::DenseTensor; USE_OP_ITSELF(check_finite_and_unscale); USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); diff --git a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc index 1f3669a4f13d7..b5750181139d4 100644 --- a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc @@ -21,14 +21,14 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ClearFloatStatusKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const auto* float_status = ctx.Input("FloatStatus"); - auto* float_status_out = ctx.Output("FloatStatusOut"); + const auto* float_status = ctx.Input("FloatStatus"); + auto* float_status_out = ctx.Output("FloatStatusOut"); // NOTE(zhiqiu): NPUClearFloatStatus modifies the input. PADDLE_ENFORCE_EQ(float_status_out, float_status, diff --git a/paddle/fluid/operators/amp/get_float_status_op_npu.cc b/paddle/fluid/operators/amp/get_float_status_op_npu.cc index c1e958ea4d237..8befb2df9b835 100644 --- a/paddle/fluid/operators/amp/get_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/get_float_status_op_npu.cc @@ -21,14 +21,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class GetFloatStatusKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const auto* float_status = ctx.Input("FloatStatus"); - auto* float_status_out = ctx.Output("FloatStatusOut"); + const auto* float_status = ctx.Input("FloatStatus"); + auto* float_status_out = ctx.Output("FloatStatusOut"); // GetClearFloatStatus modifies the input. PADDLE_ENFORCE_EQ(float_status_out, float_status, diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index 03a5f734c2dc8..f8ccac27c19c9 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -41,7 +41,7 @@ class UpdateLossScalingOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifndef PADDLE_WITH_XPU if (var_name == "FoundInfinite" || var_name == "StopUpdate") { diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 24784803f1732..dc1cd958f458c 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -25,21 +25,21 @@ DECLARE_int32(min_loss_scaling); namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template void Update(const platform::NPUDeviceContext& ctx, const std::vector found_inf_vec, - const Tensor* pre_loss_scaling_tensor, - const Tensor* good_in_tensor, - const Tensor* bad_in_tensor, + const phi::DenseTensor* pre_loss_scaling_tensor, + const phi::DenseTensor* good_in_tensor, + const phi::DenseTensor* bad_in_tensor, const int incr_every_n_steps, const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, - Tensor* updated_loss_scaling_tensor, - Tensor* good_out_tensor, - Tensor* bad_out_tensor) { + phi::DenseTensor* updated_loss_scaling_tensor, + phi::DenseTensor* good_out_tensor, + phi::DenseTensor* bad_out_tensor) { auto place = ctx.GetPlace(); auto stream = ctx.stream(); if (found_inf_vec[0]) { @@ -154,16 +154,16 @@ class UpdateLossScalingFunctor { public: void operator()(const platform::NPUDeviceContext& dev_ctx, const std::vector found_inf_vec, - const Tensor* 
pre_loss_scaling_tensor, - const Tensor* good_in_tensor, - const Tensor* bad_in_tensor, + const phi::DenseTensor* pre_loss_scaling_tensor, + const phi::DenseTensor* good_in_tensor, + const phi::DenseTensor* bad_in_tensor, const int incr_every_n_steps, const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, - Tensor* updated_loss_scaling_tensor, - Tensor* good_out_tensor, - Tensor* bad_out_tensor) const { + phi::DenseTensor* updated_loss_scaling_tensor, + phi::DenseTensor* good_out_tensor, + phi::DenseTensor* bad_out_tensor) const { Update(dev_ctx, found_inf_vec, pre_loss_scaling_tensor, @@ -184,14 +184,14 @@ class LazyZerosNPU { public: void operator()(const platform::NPUDeviceContext& dev_ctx, const std::vector found_inf_vec, - const std::vector& xs, - const std::vector& outs) const { + const std::vector& xs, + const std::vector& outs) const { if (!xs.size()) { return; } auto place = dev_ctx.GetPlace(); auto stream = dev_ctx.stream(); - Tensor* zero_tensor = nullptr; + phi::DenseTensor* zero_tensor = nullptr; void* zero_ptr = nullptr; if (found_inf_vec[0]) { int max_num = -1; @@ -234,9 +234,9 @@ class UpdateLossScalingNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - const auto xs = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); - const auto* found_inf = ctx.Input("FoundInfinite"); + const auto xs = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + const auto* found_inf = ctx.Input("FoundInfinite"); PADDLE_ENFORCE_EQ(found_inf->numel(), 1, platform::errors::InvalidArgument( @@ -252,12 +252,13 @@ class UpdateLossScalingNPUKernel : public framework::OpKernel { return; } - const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); - const auto* good_in = ctx.Input("InGoodSteps"); - const auto* bad_in = ctx.Input("InBadSteps"); - auto* updated_loss_scaling = ctx.Output("LossScaling"); - auto* good_out = ctx.Output("OutGoodSteps"); - auto* bad_out = ctx.Output("OutBadSteps"); + const auto* pre_loss_scaling = + ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); good_out->mutable_data(dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/arg_max_op_mlu.cc b/paddle/fluid/operators/arg_max_op_mlu.cc index 44f74f016c065..6d61526bc0c96 100644 --- a/paddle/fluid/operators/arg_max_op_mlu.cc +++ b/paddle/fluid/operators/arg_max_op_mlu.cc @@ -22,8 +22,8 @@ template class ArgMaxMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto axis = static_cast(ctx.Attr("axis")); auto dtype = ctx.Attr("dtype"); const bool& flatten = ctx.Attr("flatten"); @@ -49,7 +49,7 @@ class ArgMaxMLUKernel : public framework::OpKernel { axis += x_dims.size(); } - framework::Tensor flatten_x(x->type()); + phi::DenseTensor flatten_x(x->type()); flatten_x.ShareDataWith(*x); if (flatten) { flatten_x.Resize(phi::make_ddim({x->numel()})); @@ -66,7 +66,7 @@ class ArgMaxMLUKernel : public framework::OpKernel { } size_t indices_size_inbytes = out_count * sizeof(int32_t); auto& dev_ctx = ctx.template 
device_context(); - framework::Tensor value_out = + phi::DenseTensor value_out = ctx.AllocateTmpTensor(out->dims(), dev_ctx); MLUCnnlTensorDesc value_out_desc(value_out); MLUCnnlTensorDesc input_desc( @@ -93,7 +93,7 @@ class ArgMaxMLUKernel : public framework::OpKernel { GetBasePtr(&value_out)); } else { out->template mutable_data(ctx.GetPlace()); - framework::Tensor out_int32 = + phi::DenseTensor out_int32 = ctx.AllocateTmpTensor(out->dims(), dev_ctx); MLUCnnl::Reduce(ctx, diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc index e35b70754ae71..6e5048db47ead 100644 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ b/paddle/fluid/operators/arg_max_op_npu.cc @@ -18,7 +18,7 @@ limitations under the Licnse. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -29,8 +29,8 @@ struct VisitDataArgNPUMaxFunctor { : ctx(ctx) {} template void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); + auto& x = *(ctx.Input("X")); + auto& out = *(ctx.Output("Out")); out.template mutable_data(ctx.GetPlace()); auto axis = ctx.Attr("axis"); auto dtype = ctx.Attr("dtype"); diff --git a/paddle/fluid/operators/arg_min_op_npu.cc b/paddle/fluid/operators/arg_min_op_npu.cc index 0419bbdf9f170..fe917140b7b9f 100644 --- a/paddle/fluid/operators/arg_min_op_npu.cc +++ b/paddle/fluid/operators/arg_min_op_npu.cc @@ -17,17 +17,17 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ArgMinNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); int64_t axis = ctx.Attr("axis"); auto dtype = ctx.Attr("dtype"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); NpuOpRunner runner; diff --git a/paddle/fluid/operators/argsort_op_mlu.cc b/paddle/fluid/operators/argsort_op_mlu.cc index edbffb6e0cfae..e1791a8356438 100644 --- a/paddle/fluid/operators/argsort_op_mlu.cc +++ b/paddle/fluid/operators/argsort_op_mlu.cc @@ -44,7 +44,7 @@ class ArgsortMLUKernel : public framework::OpKernel { indices->mutable_data(place); // cnnl only support int32/int16 type of indices - framework::Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32)); indices_int32.Resize(indices->dims()); indices_int32.mutable_data(place); @@ -79,9 +79,9 @@ template class ArgsortGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* indices = ctx.Input("Indices"); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); int axis = ctx.Attr("axis"); dx->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 7d9c4ffdaf6da..7aedb41c9fde3 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -18,15 +18,15 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template static void TranposeNPU(const framework::ExecutionContext& ctx, const aclrtStream& stream, std::vector* perm, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { out->mutable_data(ctx.GetPlace()); NpuOpRunner runner; runner.SetType("Transpose") @@ -38,8 +38,8 @@ static void TranposeNPU(const framework::ExecutionContext& ctx, static void CastToInt64(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { out->mutable_data(ctx.GetPlace()); NpuOpRunner runner; runner.SetType("Cast") @@ -51,8 +51,8 @@ static void CastToInt64(const framework::ExecutionContext& ctx, static void CastToFP32(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { out->mutable_data(ctx.GetPlace()); NpuOpRunner runner; runner.SetType("Cast") @@ -66,9 +66,9 @@ template class ArgsortNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); int axis = ctx.Attr("axis"); bool descending = ctx.Attr("descending"); @@ -176,9 +176,9 @@ template static void FullAssignNPU(const framework::ExecutionContext& ctx, const aclrtStream& stream, const framework::DDim in_dims, - const Tensor& input, - const Tensor& indices, - Tensor* t_out) { + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + phi::DenseTensor* t_out) { const int64_t input_height = phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; @@ -226,9 +226,9 @@ template class ArgsortGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); + auto* indices = ctx.Input("Indices"); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dO = ctx.Input(framework::GradVarName("Out")); int axis = ctx.Attr("axis"); auto in_dims = indices->dims(); diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h index 19b90d360201e..990ef8d8556b3 100644 --- a/paddle/fluid/operators/array_operator.h +++ b/paddle/fluid/operators/array_operator.h @@ -55,7 +55,7 @@ class ArrayOp : public framework::OperatorBase { platform::is_npu_place(i_tensor.place()) || platform::is_custom_place(i_tensor.place())) { // FIXME: Avoid copy from GPU to CPU - framework::Tensor t; + phi::DenseTensor t; framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t); dev_ctx.Wait(); offset = static_cast(*t.data()); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 89c817889f144..9236c0b6ae5c9 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -44,8 +44,8 @@ struct ArrayToLoDFunctorImpl { }; struct ArrayToLoDFunctor : public std::unary_function { - 
std::vector in; - mutable framework::Tensor *out; + std::vector in; + mutable phi::DenseTensor *out; template void operator()(Place place) const { diff --git a/paddle/fluid/operators/ascend_trigger_op.h b/paddle/fluid/operators/ascend_trigger_op.h index d1eaa00c2a3e0..943960e1bb1c5 100644 --- a/paddle/fluid/operators/ascend_trigger_op.h +++ b/paddle/fluid/operators/ascend_trigger_op.h @@ -33,8 +33,8 @@ class AscendTriggerCPUKernel : public framework::OpKernel { auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); auto graph_idx = ctx.Attr("graph_idx"); VLOG(4) << "AscendTrigger Kernel, begin to run graph: " << graph_idx; - auto inputs = ctx.MultiInput("FeedList"); - auto outputs = ctx.MultiOutput("FetchList"); + auto inputs = ctx.MultiInput("FeedList"); + auto outputs = ctx.MultiOutput("FetchList"); ascend_ptr->RunAscendSubgraph(graph_idx, inputs, &outputs); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index ab6684ae33f7a..91bc5019f3f07 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -43,7 +43,7 @@ class AssignOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, diff --git a/paddle/fluid/operators/assign_op_test.cc b/paddle/fluid/operators/assign_op_test.cc index 0b6245f17d38d..8586329e501c8 100644 --- a/paddle/fluid/operators/assign_op_test.cc +++ b/paddle/fluid/operators/assign_op_test.cc @@ -87,7 +87,7 @@ TEST(AssignOp, AssignSelectedRows) { int64_t height = 10; phi::SelectedRows input(rows, height); - paddle::framework::Tensor* input_tensor = input.mutable_value(); + phi::DenseTensor* input_tensor = input.mutable_value(); paddle::framework::DDim in_dims = phi::make_ddim({3, 4}); int* in_data = input_tensor->mutable_data(in_dims, cpu_place); @@ -104,7 +104,7 @@ TEST(AssignOp, AssignSelectedRows) { EXPECT_EQ(rows[i], out_rows[i]); } EXPECT_EQ(height, out_selected_row.height()); - const paddle::framework::Tensor& out_tensor = out_selected_row.value(); + const phi::DenseTensor& out_tensor = out_selected_row.value(); paddle::framework::DDim out_dims = out_tensor.dims(); EXPECT_EQ(in_dims, out_dims); auto* out_data = out_tensor.data(); diff --git a/paddle/fluid/operators/assign_pos_op.cu b/paddle/fluid/operators/assign_pos_op.cu index 3f36e8b13476d..f5704b6a08617 100644 --- a/paddle/fluid/operators/assign_pos_op.cu +++ b/paddle/fluid/operators/assign_pos_op.cu @@ -73,7 +73,7 @@ class AssignPosCUDAKernel : public framework::OpKernel { T* cum_data = const_cast(cum_count->data()); auto cum_size = cum_count->numel(); - framework::Tensor cpu_eff_num_len; + phi::DenseTensor cpu_eff_num_len; int64_t cpu_eff_num_len_data = 0; if (platform::is_cpu_place(eff_num_len->place())) { cpu_eff_num_len_data = eff_num_len->data()[0]; diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h index 775f0788aea3f..1954c1ee1571d 100644 --- a/paddle/fluid/operators/assign_value_op.h +++ b/paddle/fluid/operators/assign_value_op.h @@ -24,17 +24,15 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; - template typename std::enable_if::value>::type CopyVectorToTensor( const char* value_name, - 
framework::Tensor* out, + phi::DenseTensor* out, const framework::ExecutionContext& ctx) { - // If attribute value dtype is vector, it will be converted to - // vector. - // at the same time, we can not use vector to hold the value, because - // the c++ use bit value to replace byte value. + // phi::DenseTensore dtype is vector, it will be converted to + // vector. + // at the same time, we can not use vector to hold the value, because + // the c++ use bit value to replace byte value. auto values = ctx.Attr>(value_name); framework::TensorFromVector(values, ctx.device_context(), out); @@ -51,7 +49,7 @@ typename std::enable_if::value>::type CopyVectorToTensor( template typename std::enable_if::value>::type CopyVectorToTensor( const char* value_name, - framework::Tensor* out, + phi::DenseTensor* out, const framework::ExecutionContext& ctx) { auto values = ctx.Attr>(value_name); framework::TensorFromVector(values, ctx.device_context(), out); @@ -62,7 +60,7 @@ class AssignValueKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { auto shape = ctx.Attr>("shape"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); int dtype = ctx.Attr("dtype"); const char* value_name = nullptr; switch (dtype) { diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 203ccd8e6034d..49a847eecaeaa 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -340,21 +340,22 @@ class AttentionLSTMKernel : public framework::OpKernel { using DeviceContext = phi::CPUContext; auto* x = ctx.Input("X"); - auto* h0 = ctx.Input("H0"); - auto* c0 = ctx.Input("C0"); - auto* atten_w = ctx.Input("AttentionWeight"); - auto* atten_b = ctx.Input("AttentionBias"); - auto* atten_scalar = ctx.Input("AttentionScalar"); - auto* atten_scalar_bias = ctx.Input("AttentionScalarBias"); - auto* lstm_w = ctx.Input("LSTMWeight"); - auto* lstm_b = ctx.Input("LSTMBias"); + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + auto* atten_w = ctx.Input("AttentionWeight"); + auto* atten_b = ctx.Input("AttentionBias"); + auto* atten_scalar = ctx.Input("AttentionScalar"); + auto* atten_scalar_bias = + ctx.Input("AttentionScalarBias"); + auto* lstm_w = ctx.Input("LSTMWeight"); + auto* lstm_b = ctx.Input("LSTMBias"); auto* hidden_out = ctx.Output("Hidden"); auto* cell_out = ctx.Output("Cell"); - auto* atted_x = ctx.Output("AttentionedX"); - auto* fc_out = ctx.Output("AttentionFCOut"); - auto* lstm_x = ctx.Output("LSTMX"); - auto* lstm_out = ctx.Output("LSTMOUT"); + auto* atted_x = ctx.Output("AttentionedX"); + auto* fc_out = ctx.Output("AttentionFCOut"); + auto* lstm_x = ctx.Output("LSTMX"); + auto* lstm_out = ctx.Output("LSTMOUT"); // some shape should be reshape here since infershape can not get lod info auto x_lod = x->lod(); diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h index 6ede3a7f3c96d..16142be6d1e35 100644 --- a/paddle/fluid/operators/attention_lstm_op.h +++ b/paddle/fluid/operators/attention_lstm_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class AttentionLSTMOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index 362489e51acc2..b8b67d344d2d6 100644 --- a/paddle/fluid/operators/batch_fc_op.cu 
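Aside on the comment rewritten in assign_value_op.h above: bool attribute values are staged through vector<int> because std::vector<bool> is a bit-packed specialization with no contiguous byte storage to copy from. A minimal standalone C++ illustration of that constraint (nothing below is Paddle code; the names are ours):

#include <cassert>
#include <type_traits>
#include <vector>

int main() {
  std::vector<bool> flags = {true, false, true};
  // operator[] yields a proxy object over packed bits, not a bool&,
  // and vector<bool> provides no data() pointing at contiguous bools.
  static_assert(
      !std::is_same<decltype(flags[0]), bool&>::value,
      "vector<bool> elements are proxies, so the buffer cannot be memcpy'd");
  // Widening to a byte-addressable element type restores a copyable buffer,
  // which is what the bool overload of CopyVectorToTensor above works around.
  std::vector<int> widened(flags.begin(), flags.end());
  assert(widened.size() == flags.size());
  assert(widened[0] == 1 && widened[1] == 0 && widened[2] == 1);
  return 0;
}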
+++ b/paddle/fluid/operators/batch_fc_op.cu @@ -22,7 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; const int CUDA_NUM_THREADS = 1024; static inline int GET_BLOCKS(const int N) { @@ -95,8 +94,8 @@ class BatchFCCUDAKernel : public framework::OpKernel { // b.dim = slot_pairs_num * out_dim // output.dim = slot_pairs_num * ins_num * out_dim auto* input = ctx.Input("Input"); - auto* w = ctx.Input("W"); - auto* bias = ctx.Input("Bias"); + auto* w = ctx.Input("W"); + auto* bias = ctx.Input("Bias"); auto* output = ctx.Output("Out"); auto input_dims = input->dims(); auto w_dims = w->dims(); @@ -154,13 +153,13 @@ template class BatchFCGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* w = ctx.Input("W"); - auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* input = ctx.Input("Input"); + auto* w = ctx.Input("W"); + auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("Input")); - auto* dw = ctx.Output(framework::GradVarName("W")); - auto* db = ctx.Output(framework::GradVarName("Bias")); + auto* dx = ctx.Output(framework::GradVarName("Input")); + auto* dw = ctx.Output(framework::GradVarName("W")); + auto* db = ctx.Output(framework::GradVarName("Bias")); auto input_dims = input->dims(); auto w_dims = w->dims(); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 84f22ebff4084..4979ab0345200 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -178,21 +178,24 @@ framework::OpKernelType BatchNormOp::GetExpectedKernelType( } PADDLE_ENFORCE_EQ( bn_param_type, - framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), + framework::TransToProtoVarType( + ctx.Input("Scale")->dtype()), platform::errors::InvalidArgument("Scale input should be of float type")); PADDLE_ENFORCE_EQ( bn_param_type, - framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), + framework::TransToProtoVarType( + ctx.Input("Bias")->dtype()), platform::errors::InvalidArgument("Bias input should be of float type")); PADDLE_ENFORCE_EQ( bn_param_type, - framework::TransToProtoVarType(ctx.Input("Mean")->dtype()), + framework::TransToProtoVarType( + ctx.Input("Mean")->dtype()), platform::errors::InvalidArgument("Mean input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Variance")->dtype()), - platform::errors::InvalidArgument( - "Variance input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Variance")->dtype()), + platform::errors::InvalidArgument( + "Variance input should be of float type")); // TODO(pzelazko-intel): enable MKLDNN layout when it's ready #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index a19b087245a89..a9d1968d9fe58 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -34,7 +34,7 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = framework::DataLayout; template using CudnnDataType = platform::CudnnDataType; diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 
1efabccb45e60..95008b19f377d 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; @@ -54,7 +54,7 @@ class BatchNormOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; @@ -69,7 +69,7 @@ class BatchNormGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 1aa445bda3717..ef97e07ec71d1 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -38,7 +38,7 @@ class MLUBatchNormOpKernel : public framework::OpKernel { const std::string data_layout_str = ctx.Attr("data_layout"); DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - const auto *x = ctx.Input("X"); + const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); PADDLE_ENFORCE_GE( x_dims.size(), @@ -60,16 +60,16 @@ class MLUBatchNormOpKernel : public framework::OpKernel { : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); + const auto *running_mean = ctx.Input("Mean"); + const auto *running_var = ctx.Input("Variance"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); - auto *y = ctx.Output("Y"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); // alloc memory y->mutable_data(place); @@ -115,7 +115,7 @@ class MLUBatchNormOpKernel : public framework::OpKernel { } if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); + const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; @@ -161,22 +161,24 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *saved_mean = ctx.Input("SavedMean"); // SavedVariance have been reverted in forward operator - const auto 
*saved_inv_variance = ctx.Input("SavedVariance"); + const auto *saved_inv_variance = + ctx.Input("SavedVariance"); const std::string data_layout_str = ctx.Attr("data_layout"); bool use_global_stats = ctx.Attr("use_global_stats"); const bool is_test = ctx.Attr("is_test"); const float epsilon = ctx.Attr("epsilon"); DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = + ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); auto &dev_ctx = ctx.template device_context(); auto d_x_tmp = @@ -270,8 +272,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { } if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); + const auto *running_mean = ctx.Input("Mean"); + const auto *running_variance = ctx.Input("Variance"); MLUCnnl::FusedBatchNormGrad(ctx, false /*is_training*/, transformed_desc.get(), diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index b369a2011aff9..034c578ddde58 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -36,7 +36,7 @@ class NPUBatchNormOpKernel : public framework::OpKernel { const std::string data_layout_str = ctx.Attr("data_layout"); DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - const auto *x = ctx.Input("X"); + const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); PADDLE_ENFORCE_EQ( (x_dims.size() == 4UL || x_dims.size() == 3UL), @@ -47,12 +47,12 @@ class NPUBatchNormOpKernel : public framework::OpKernel { x_dims.to_str(), x_dims.size())); - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); + const auto *running_mean = ctx.Input("Mean"); + const auto *running_var = ctx.Input("Variance"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); - auto *y = ctx.Output("Y"); + auto *y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -76,10 +76,10 @@ class NPUBatchNormOpKernel : public framework::OpKernel { {{"epsilon", epsilon}}); runner_infer.Run(stream); } else { - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); mean_out->mutable_data(ctx.GetPlace()); variance_out->mutable_data(ctx.GetPlace()); saved_mean->mutable_data(ctx.GetPlace()); @@ -88,14 +88,14 @@ class NPUBatchNormOpKernel : public framework::OpKernel { // if MomentumTensor is set, use MomentumTensor value, momentum // is only used in this training branch if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); + const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; paddle::framework::TensorCopySync( *mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; } - 
framework::Tensor sum, square_sum; + phi::DenseTensor sum, square_sum; sum.mutable_data(running_mean->dims(), ctx.GetPlace()); square_sum.mutable_data(running_mean->dims(), ctx.GetPlace()); @@ -137,22 +137,24 @@ template class NPUBatchNormGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *saved_mean = ctx.Input("SavedMean"); // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); + const auto *saved_inv_variance = + ctx.Input("SavedVariance"); const std::string data_layout_str = ctx.Attr("data_layout"); bool use_global_stats = ctx.Attr("use_global_stats"); const bool is_test = ctx.Attr("is_test"); const float epsilon = ctx.Attr("epsilon"); DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = + ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); use_global_stats = is_test || use_global_stats; @@ -184,8 +186,8 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { d_scale->mutable_data(ctx.GetPlace()); d_bias->mutable_data(ctx.GetPlace()); if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); + const auto *running_mean = ctx.Input("Mean"); + const auto *running_variance = ctx.Input("Variance"); const auto &runner_update = NpuOpRunner("BNTrainingUpdateGrad", {dy_tensor, x_tensor, *running_mean, *running_variance}, @@ -223,7 +225,7 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { dx_tensor.Resize(x_new_shape); dy_tensor.Resize(x_new_shape); } - const auto *running_var = ctx.Input("Variance"); + const auto *running_var = ctx.Input("Variance"); const auto &runner_infer = NpuOpRunner("BNInferGrad", {dy_tensor, *scale, *running_var}, diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 6e3bea17863ab..3c775ced3f434 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class BCELossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/bce_loss_op_mlu.cc b/paddle/fluid/operators/bce_loss_op_mlu.cc index c194da4d65bcf..99fd402424e7c 100644 --- a/paddle/fluid/operators/bce_loss_op_mlu.cc +++ b/paddle/fluid/operators/bce_loss_op_mlu.cc @@ -18,15 +18,15 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class BCELossMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* labels = ctx.Input("Label"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -50,10 +50,10 @@ template class BCELossGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* labels = ctx.Input("Label"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc index 57dd53e5968c1..c6b2d12ac535e 100644 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ b/paddle/fluid/operators/bce_loss_op_npu.cc @@ -18,15 +18,15 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class BCELossNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* labels = ctx.Input("Label"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -47,10 +47,10 @@ template class BCELossGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* labels = ctx.Input("Label"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index e9991e697903a..09adff97c1ce4 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -49,7 +49,7 @@ class BeamSearchOpKernel : public framework::OpKernel { auto selected_ids = context.Output("selected_ids"); auto selected_scores = context.Output("selected_scores"); - auto* parent_idx = context.Output("parent_idx"); + auto* parent_idx = context.Output("parent_idx"); PADDLE_ENFORCE_NOT_NULL( selected_ids, platform::errors::NotFound( diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc index 19347abac5e79..db3ebba8edc5a 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cc +++ b/paddle/fluid/operators/bilateral_slice_op.cc @@ -20,7 +20,6 @@ namespace paddle { namespace operators { -using framework::Tensor; using DataLayout = framework::DataLayout; class BilateralSliceOp : public framework::OperatorWithKernel { @@ -94,7 +93,7 @@ class BilateralSliceOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& 
tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return framework::OpKernelType( expected_kernel_type.data_type_, tensor.place(), tensor.layout()); diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index 81afe68dbd23c..590b0d8ab39d5 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -19,7 +19,6 @@ namespace paddle { namespace operators { -using framework::Tensor; using DataLayout = framework::DataLayout; template @@ -131,10 +130,10 @@ template class BilateralSliceOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* guide = ctx.Input("Guide"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* grid = ctx.Input("Grid"); + auto* guide = ctx.Input("Guide"); + auto* output = ctx.Output("Out"); auto* output_data = output->mutable_data(ctx.GetPlace()); auto* grid_data = grid->data(); @@ -447,13 +446,17 @@ template class BilateralSliceGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* guide = ctx.Input("Guide"); - auto* grid = ctx.Input("Grid"); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); - auto* guide_grad = ctx.Output(framework::GradVarName("Guide")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input = ctx.Input("X"); + auto* guide = ctx.Input("Guide"); + auto* grid = ctx.Input("Grid"); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + auto* grid_grad = + ctx.Output(framework::GradVarName("Grid")); + auto* guide_grad = + ctx.Output(framework::GradVarName("Guide")); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); const T* input_data = input->data(); const T* guide_data = guide->data(); diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc index d52de7ace64ab..5f5e19c585bae 100644 --- a/paddle/fluid/operators/bincount_op.cc +++ b/paddle/fluid/operators/bincount_op.cc @@ -25,7 +25,6 @@ namespace paddle { namespace operators { using framework::OpKernelType; -using framework::Tensor; class BincountOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/bmm_op.h b/paddle/fluid/operators/bmm_op.h index 110cd2d2810d8..5ca8df0182049 100644 --- a/paddle/fluid/operators/bmm_op.h +++ b/paddle/fluid/operators/bmm_op.h @@ -26,10 +26,10 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; static void ReshapeTensorIntoMatrixSequence( - framework::Tensor *x, const phi::funcs::MatDescriptor &descriptor) { + phi::DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { int64_t h, w; h = descriptor.height_; w = descriptor.width_; @@ -40,9 +40,9 @@ static void ReshapeTensorIntoMatrixSequence( x->Resize({descriptor.batch_size_, h, w}); } -static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, - framework::Tensor *y, - framework::Tensor *out, +static void ReshapeXYOutIntoMatrixSequence(phi::DenseTensor *x, + phi::DenseTensor *y, + phi::DenseTensor *out, bool trans_x, bool trans_y) { auto x_dim = x->dims(); diff --git a/paddle/fluid/operators/bpr_loss_op.h 
b/paddle/fluid/operators/bpr_loss_op.h index 1c7f158c14b7c..2e1d62dddd2c3 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; /*Todo: *Find a way to adapt TolerableValue, using blas or eigen. */ @@ -39,19 +38,19 @@ template class BprLossOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - auto* y = ctx.Output("Y"); + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); + auto* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); int rank = x->dims().size(); - Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); - Tensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1); - Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); + phi::DenseTensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); + phi::DenseTensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1); + phi::DenseTensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); - const framework::Tensor* logits = &x_2d; - const framework::Tensor* labels = &labels_2d; - framework::Tensor* out = &y_2d; + const phi::DenseTensor* logits = &x_2d; + const phi::DenseTensor* labels = &labels_2d; + phi::DenseTensor* out = &y_2d; const int step_size = logits->dims()[0]; const int class_num = logits->dims()[1]; @@ -87,10 +86,10 @@ template class BprLossGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* label = ctx.Input("Label"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* label = ctx.Input("Label"); + auto* dx = ctx.Output(framework::GradVarName("X")); const size_t step_size = static_cast(x->dims()[0]); const size_t num_classes = static_cast(x->dims()[1]); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 4f681bc6508d2..df91ef10b181a 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -21,7 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { using framework::DDim; -using framework::Tensor; class BroadcastTensorsOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 83cc2e2122539..f4121573577ad 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -30,11 +30,11 @@ struct CastOpTransformFunctor { template struct CastOpFunctor { - const framework::Tensor* in_; - framework::Tensor* out_; + const phi::DenseTensor* in_; + phi::DenseTensor* out_; const DeviceContext& ctx_; - CastOpFunctor(const framework::Tensor* in, - framework::Tensor* out, + CastOpFunctor(const phi::DenseTensor* in, + phi::DenseTensor* out, const DeviceContext& ctx) : in_(in), out_(out), ctx_(ctx) {} @@ -54,8 +54,8 @@ template class CastOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); + auto* in = context.Input("X"); + auto* out = context.Output("Out"); auto out_dtype = context.Attr("out_dtype"); diff --git a/paddle/fluid/operators/cast_op_mlu.cc b/paddle/fluid/operators/cast_op_mlu.cc index 2caa45702fc9d..7e85702eee4b1 100644 --- a/paddle/fluid/operators/cast_op_mlu.cc +++ b/paddle/fluid/operators/cast_op_mlu.cc @@ -19,14 +19,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class CastMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); auto src_type = static_cast(ctx.Attr("in_dtype")); auto dst_type = static_cast(ctx.Attr("out_dtype")); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc index f9ec6f0685d75..9c430fc0ffe30 100644 --- a/paddle/fluid/operators/cast_op_npu.cc +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -32,15 +32,15 @@ static std::map {framework::proto::VarType::FP64, ACL_DOUBLE}, }; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class CastNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); int dtype = ctx.Attr("out_dtype"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); if (framework::TransToProtoVarType(x->dtype()) == dtype) { diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index 2548b13559133..fed463d8f7cd7 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -87,10 +87,10 @@ class CenterLossCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { auto &device_context = ctx.template device_context(); auto stream = device_context.stream(); - auto *X = ctx.Input("X"); // deep feature - auto *labels = ctx.Input("Label"); - auto *centers = ctx.Input("Centers"); - auto *update_rate = ctx.Input("CenterUpdateRate"); + auto *X = ctx.Input("X"); // deep feature + auto *labels = ctx.Input("Label"); + auto *centers = ctx.Input("Centers"); + auto *update_rate = ctx.Input("CenterUpdateRate"); int cluster_num = ctx.Attr("cluster_num"); 
auto *lr_center = update_rate->data(); bool need_update = static_cast(ctx.Attr("need_update")); @@ -102,24 +102,24 @@ class CenterLossCUDAKernel : public framework::OpKernel { int batch_size = x_dims[0]; const int deep_feat_dim = x_dims[1]; - auto *centers_diff = ctx.Output("SampleCenterDiff"); + auto *centers_diff = ctx.Output("SampleCenterDiff"); auto centers_diff_data = centers_diff->mutable_data(ctx.GetPlace()); auto centers_data = centers->data(); auto centers_dim = centers->dims(); - auto *out_loss = ctx.Output("Loss"); + auto *out_loss = ctx.Output("Loss"); auto loss_data = out_loss->mutable_data(ctx.GetPlace()); - auto *centers_out = ctx.Output("CentersOut"); + auto *centers_out = ctx.Output("CentersOut"); auto *centers_out_data = centers_out->mutable_data(ctx.GetPlace()); auto ctx_place = ctx.GetPlace(); if (centers != centers_out) { framework::TensorCopy( - *static_cast(centers), + *static_cast(centers), ctx_place, *platform::DeviceContextPool::Instance().Get(ctx_place), - static_cast(centers_out)); + static_cast(centers_out)); } int64_t numel = X->numel(); diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h index 49aec390599a5..989a27f552118 100644 --- a/paddle/fluid/operators/center_loss_op.h +++ b/paddle/fluid/operators/center_loss_op.h @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template @@ -45,10 +45,10 @@ template class CenterLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); // deep feature - auto *labels = ctx.Input("Label"); - auto *centers = ctx.Input("Centers"); - auto *update_rate = ctx.Input("CenterUpdateRate"); + auto *X = ctx.Input("X"); // deep feature + auto *labels = ctx.Input("Label"); + auto *centers = ctx.Input("Centers"); + auto *update_rate = ctx.Input("CenterUpdateRate"); int cluster_num = ctx.Attr("cluster_num"); auto *lr_center = update_rate->data(); T alpha = lr_center[0]; @@ -64,11 +64,11 @@ class CenterLossKernel : public framework::OpKernel { int batch_size = x_dims[0]; int deep_feat_dim = x_dims[1]; - auto centers_diff = ctx.Output("SampleCenterDiff"); + auto centers_diff = ctx.Output("SampleCenterDiff"); auto centers_diff_data = centers_diff->mutable_data(ctx.GetPlace()); - auto *out_loss = ctx.Output("Loss"); + auto *out_loss = ctx.Output("Loss"); - auto *centers_out = ctx.Output("CentersOut"); + auto *centers_out = ctx.Output("CentersOut"); auto *centers_out_data = centers_out->mutable_data(ctx.GetPlace()); if (centers_out_data != centers_data) { @@ -138,9 +138,9 @@ template class CenterLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *in0 = context.Input("SampleCenterDiff"); - auto *in1 = context.Input(framework::GradVarName("Loss")); - auto *x_g = context.Output(framework::GradVarName("X")); + auto *in0 = context.Input("SampleCenterDiff"); + auto *in1 = context.Input(framework::GradVarName("Loss")); + auto *x_g = context.Output(framework::GradVarName("X")); auto sub_result = EigenMatrix::From(*in0); auto out_grad = EigenMatrix::From(*in1); diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h index 8784c49659669..823a759dddc74 100644 --- a/paddle/fluid/operators/chunk_eval_op.h +++ b/paddle/fluid/operators/chunk_eval_op.h @@ -23,7 +23,6 @@ limitations under the 
License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; template @@ -191,12 +190,13 @@ class ChunkEvalKernel : public framework::OpKernel { auto* inference = context.Input("Inference"); auto place = inference->place(); auto* label = context.Input("Label"); - auto* precision = context.Output("Precision"); - auto* recall = context.Output("Recall"); - auto* f1 = context.Output("F1-Score"); - auto* num_infer_chunks = context.Output("NumInferChunks"); - auto* num_label_chunks = context.Output("NumLabelChunks"); - auto* num_correct_chunks = context.Output("NumCorrectChunks"); + auto* precision = context.Output("Precision"); + auto* recall = context.Output("Recall"); + auto* f1 = context.Output("F1-Score"); + auto* num_infer_chunks = context.Output("NumInferChunks"); + auto* num_label_chunks = context.Output("NumLabelChunks"); + auto* num_correct_chunks = + context.Output("NumCorrectChunks"); const int64_t* inference_data = inference->data(); const int64_t* label_data = label->data(); @@ -219,7 +219,7 @@ class ChunkEvalKernel : public framework::OpKernel { if (use_padding) { auto dim1 = inference->dims()[1]; - auto* seq_length_t = context.Input("SeqLength"); + auto* seq_length_t = context.Input("SeqLength"); auto* seq_length_data = seq_length_t->data(); num_sequences = seq_length_t->dims()[0]; diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index 0bbbcc8b03177..a868a182bfc5e 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -52,7 +52,7 @@ class CinnCompiledObject; namespace operators::details { -using CinnTensor = ::cinn::hlir::framework::Tensor; +using CinnTensor = ::cinn::hlir::Tensor; using CinnScope = ::cinn::hlir::framework::Scope; using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 6fde5106f10a4..841a12ac81bd6 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; // using SelectedRows = phi::SelectedRows; template class NPUClipByNormKernel : public framework::OpKernel { @@ -39,8 +39,8 @@ class NPUClipByNormKernel : public framework::OpKernel { context.template device_context(); auto stream = dev_ctx.stream(); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); output->mutable_data(place); PADDLE_ENFORCE_NOT_NULL(input, diff --git a/paddle/fluid/operators/clip_by_norm_op_xpu.cc b/paddle/fluid/operators/clip_by_norm_op_xpu.cc index dcf3a7826f5fc..b99d12b8628e8 100644 --- a/paddle/fluid/operators/clip_by_norm_op_xpu.cc +++ b/paddle/fluid/operators/clip_by_norm_op_xpu.cc @@ -27,12 +27,12 @@ class XPUClipByNormKernel : public framework::OpKernel { auto max_norm = context.Attr("max_norm"); auto in_var = context.InputVar("X"); - Tensor* output = nullptr; - const Tensor* input = nullptr; + phi::DenseTensor* output = nullptr; + const phi::DenseTensor* input = nullptr; if (in_var->IsType()) { - input = context.Input("X"); + input = context.Input("X"); - output = context.Output("Out"); + output = context.Output("Out"); output->mutable_data(context.GetPlace()); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/clip_op_mlu.cc b/paddle/fluid/operators/clip_op_mlu.cc index 88cce62de6cac..daced778a95dc 100644 --- a/paddle/fluid/operators/clip_op_mlu.cc +++ b/paddle/fluid/operators/clip_op_mlu.cc @@ -22,15 +22,15 @@ template class ClipMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto min = static_cast(ctx.Attr("min")); auto max = static_cast(ctx.Attr("max")); if (ctx.HasInput("Min")) { Tensor min_cpu; - auto* min_tensor = ctx.Input("Min"); + auto* min_tensor = ctx.Input("Min"); auto* min_data = min_tensor->data(); if (platform::is_mlu_place(min_tensor->place())) { paddle::framework::TensorCopySync( @@ -42,7 +42,7 @@ class ClipMLUKernel : public framework::OpKernel { if (ctx.HasInput("Max")) { Tensor max_cpu; - auto* max_tensor = ctx.Input("Max"); + auto* max_tensor = ctx.Input("Max"); auto* max_data = max_tensor->data(); if (platform::is_mlu_place(max_tensor->place())) { paddle::framework::TensorCopySync( @@ -68,13 +68,15 @@ template class ClipGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); - auto* min_tensor = ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; - auto* max_tensor = ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; + auto* min_tensor = + ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; + auto* max_tensor = + ctx.HasInput("Max") ? 
ctx.Input("Max") : nullptr; auto min_val = ctx.Attr("min"); if (min_tensor) { diff --git a/paddle/fluid/operators/clip_op_npu.cc b/paddle/fluid/operators/clip_op_npu.cc index 0a7b05f06814d..19ae23add0e10 100644 --- a/paddle/fluid/operators/clip_op_npu.cc +++ b/paddle/fluid/operators/clip_op_npu.cc @@ -18,18 +18,20 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ClipNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto min_tensor = ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; - auto max_tensor = ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; + auto min_tensor = + ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; + auto max_tensor = + ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; Tensor min_tensor_temp(x->type()); Tensor max_tensor_temp(x->type()); @@ -60,13 +62,15 @@ template class ClipGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); - auto* min_tensor = ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; - auto* max_tensor = ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; + auto* min_tensor = + ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; + auto* max_tensor = + ctx.HasInput("Max") ? 
ctx.Input("Max") : nullptr; auto min_val = ctx.Attr("min"); if (min_tensor) { diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 3e77bd91baf29..beb02ad5a987b 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -414,7 +414,7 @@ class CoalesceTensorOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 12507d76fe73a..a4f935a9c9586 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -39,8 +39,8 @@ class AllReduceOpKernel : public framework::OpKernel { "AllReduce op can run on gpu place only for now.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto& dev_ctx = ctx.template device_context(); - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); int dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index de15395eb4df5..622b25f2a49bb 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -27,8 +27,8 @@ class BarrierOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); ncclDataType_t dtype = diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index 4f21dc2992a39..9d1fedc16908d 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -39,8 +39,8 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { int dev_id = ctx.GetPlace().device; int root_dev_id = ctx.Attr("root"); - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); PADDLE_ENFORCE_EQ( out->IsInitialized(), true, diff --git a/paddle/fluid/operators/collective/broadcast_op_xpu.cc b/paddle/fluid/operators/collective/broadcast_op_xpu.cc index 437a93da18843..54eccbead94be 100644 --- a/paddle/fluid/operators/collective/broadcast_op_xpu.cc +++ b/paddle/fluid/operators/collective/broadcast_op_xpu.cc @@ -44,8 +44,8 @@ class BKCLBroadcastOpKernel : public framework::OpKernel { int dev_id = ctx.GetPlace().device; int root_dev_id = ctx.Attr("root"); - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); PADDLE_ENFORCE_EQ( out->IsInitialized(), true, diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 8356bbb65a8a7..963eda0723080 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ 
b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -30,8 +30,8 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h index 364b813629bd3..198ec4009f4d3 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.h +++ b/paddle/fluid/operators/collective/c_allgather_op.h @@ -37,8 +37,8 @@ class CAllGatherOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_GLOO) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); framework::DDim out_dims = in->dims(); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc index fc3ad8a006ec5..7bd30ecadc8c8 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc @@ -28,8 +28,8 @@ class CAllGatherOpMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_CNCL) - auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); cnclDataType_t dtype = platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype())); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc index f682872d5c662..b535441ea28ee 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -29,8 +29,8 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); HcclDataType dtype = platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype())); diff --git a/paddle/fluid/operators/collective/c_allgather_op_xpu.cc b/paddle/fluid/operators/collective/c_allgather_op_xpu.cc index ca865f7522a23..107f5ccd1b563 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_xpu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_xpu.cc @@ -27,8 +27,8 @@ class CAllGatherOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_XPU_BKCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); BKCLDataType dtype = platform::ToBKCLDataType(framework::TransToProtoVarType(in->dtype())); diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 299dd59d5efa7..87c81fdd738d5 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -83,8 +83,8 @@ class CAllReduceOpCPUKernel : public 
framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_GLOO) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); int64_t send_numel = in->numel(); @@ -138,8 +138,8 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { // return true if found_nan or return false; inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, aclrtStream stream, - const paddle::framework::Tensor* in) { - using Tensor = paddle::framework::Tensor; + const phi::DenseTensor* in) { + using Tensor = phi::DenseTensor; Tensor out(in->type()); Tensor mean(in->type()); @@ -180,8 +180,8 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); HcclDataType dtype = platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype())); @@ -237,7 +237,7 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { << ", use_calc_stream:" << ctx.Attr("use_calc_stream") << ", stream:" << stream; - framework::Tensor tmp; + phi::DenseTensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); bool found_nan = false; @@ -263,7 +263,7 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { T inf = static_cast(std::numeric_limits::infinity()); VLOG(4) << "fill input data constant inf"; auto dims = in->dims(); - auto mutable_in = const_cast(in); + auto mutable_in = const_cast(in); FillNpuTensorWithConstant(mutable_in, inf); mutable_in->Resize(dims); } @@ -296,8 +296,8 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_XPU_BKCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); BKCLDataType dtype = @@ -365,8 +365,8 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); @@ -465,8 +465,8 @@ class CAllReduceOpMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_CNCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); cnclDataType_t dtype = diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index e43c67d7bf369..2a40b1e45911e 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -73,10 +73,10 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { if (out != x) { framework::TensorCopy( - *static_cast(x), + *static_cast(x), place, *platform::DeviceContextPool::Instance().Get(place), - static_cast(out)); + static_cast(out)); } } else { PADDLE_ENFORCE_GPU_SUCCESS( diff --git 
a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h index 2ccdd91317656..140a4383211f4 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.h +++ b/paddle/fluid/operators/collective/c_broadcast_op.h @@ -36,8 +36,8 @@ class CBroadcastOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_GLOO) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); auto root = ctx.Attr("root"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index ac24451cf81ba..29bbd2afce1fc 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -59,10 +59,10 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { if (out != x) { framework::TensorCopy( - *static_cast(x), + *static_cast(x), place, *platform::DeviceContextPool::Instance().Get(place), - static_cast(out)); + static_cast(out)); } } else { PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(out->mutable_data(place), diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc index da394182c1736..9bace16f15482 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -65,10 +65,10 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel { dev_ctx->Wait(); if (out != x) { - framework::TensorCopy(*static_cast(x), + framework::TensorCopy(*static_cast(x), place, *platform::DeviceContextPool::Instance().Get(place), - static_cast(out)); + static_cast(out)); } dev_ctx->Wait(); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index 74bdd2b63ae57..e2ee9cefdbfb2 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -32,8 +32,8 @@ template class CConcatOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); @@ -62,7 +62,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { nranks)); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - framework::Tensor temp_out; + phi::DenseTensor temp_out; framework::DDim temp_out_dims = x->dims(); temp_out_dims[0] *= nranks; temp_out.mutable_data(temp_out_dims, place); @@ -101,14 +101,14 @@ class CConcatOpCUDAKernel : public framework::OpKernel { stream)); } - std::vector inputs; + std::vector inputs; int axis = x->dims().size() - 1; auto out_dims = x->dims(); out_dims[out_dims.size() - 1] *= nranks; int rows_per_tensor = x->dims()[0]; int offset = 0; for (int i = 0; i < nranks; i++) { - framework::Tensor temp = temp_out.Slice(offset, offset + rows_per_tensor); + phi::DenseTensor temp = temp_out.Slice(offset, offset + rows_per_tensor); inputs.emplace_back(temp); offset += rows_per_tensor; } diff --git a/paddle/fluid/operators/collective/c_embedding_op_npu.cc b/paddle/fluid/operators/collective/c_embedding_op_npu.cc index 95ef754196fea..cb016f76110e4 100644 --- 
a/paddle/fluid/operators/collective/c_embedding_op_npu.cc +++ b/paddle/fluid/operators/collective/c_embedding_op_npu.cc @@ -51,21 +51,21 @@ void shard_index(const Tensor &table_t, auto stream = context.template device_context() .stream(); - framework::Tensor id_t_d; + phi::DenseTensor id_t_d; id_t_d.mutable_data(ids_t.dims(), context.GetPlace()); FillNPU(&id_t_d, static_cast(0.0), context); id_t_d.Resize(ids_t.dims()); - framework::Tensor id_t_u; + phi::DenseTensor id_t_u; id_t_u.mutable_data(ids_t.dims(), context.GetPlace()); FillNPU(&id_t_u, static_cast(height - 1), context); id_t_u.Resize(ids_t.dims()); - framework::Tensor id_matched_d; + phi::DenseTensor id_matched_d; id_matched_d.mutable_data(ids_t.dims(), context.GetPlace()); - framework::Tensor id_matched_u; + phi::DenseTensor id_matched_u; id_matched_u.mutable_data(ids_t.dims(), context.GetPlace()); - framework::Tensor ignore_tensor; + phi::DenseTensor ignore_tensor; ignore_tensor.mutable_data(ids_t.dims(), context.GetPlace()); FillNPU(&ignore_tensor, static_cast(height), context); ignore_tensor.Resize(ids_t.dims()); @@ -120,7 +120,7 @@ void NPUGetIdsEmbedding(const framework::ExecutionContext &context) { context.template device_context() .stream(); - framework::Tensor ids_t_local; + phi::DenseTensor ids_t_local; ids_t_local.mutable_data(ids_t->dims(), context.GetPlace()); shard_index(*table_t, *ids_t, start_idx, ids_t_local, context); @@ -185,7 +185,7 @@ void NPUUpdateEmbedding(const framework::ExecutionContext &context) { const int64_t start_idx = context.Attr("start_index"); auto ids_t = context.Input("Ids"); auto d_output_t = context.Input(framework::GradVarName("Out")); - auto table_t = context.Input("W"); + auto table_t = context.Input("W"); auto table_grad_t = context.Output(framework::GradVarName("W")); VLOG(10) << "ids_t:" << ids_t << ", d_output_t:" << d_output_t @@ -196,7 +196,7 @@ void NPUUpdateEmbedding(const framework::ExecutionContext &context) { .stream(); // convert ids_t to local valid - framework::Tensor ids_t_local; + phi::DenseTensor ids_t_local; ids_t_local.mutable_data(ids_t->dims(), context.GetPlace()); shard_index(*table_t, *ids_t, start_idx, ids_t_local, context); diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index dae4fa497f7fb..1b6149f3fd55e 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -78,8 +78,8 @@ class CReduceOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_GLOO) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); auto root_id = ctx.Attr("root_id"); auto place = ctx.GetPlace(); @@ -223,8 +223,8 @@ class CReduceOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_XPU_BKCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); BKCLDataType dtype = @@ -294,8 +294,8 @@ class CReduceOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); ncclDataType_t 
dtype = diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 354c31c213b63..9495ba44ca8a6 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -27,8 +27,8 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc index d366e3e867c06..81831d9c69328 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc @@ -27,8 +27,8 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); int ring_id = ctx.Attr("ring_id"); std::string group = diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index 42d9ed2342ca0..9d53856a74b00 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -68,7 +68,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { framework::DDim x_dims = x->dims(); framework::DDim out_dims(x_dims); - framework::Tensor temp; + phi::DenseTensor temp; auto out_ptr = temp.mutable_data(out_dims, place); if (root_id == comm->rank()) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( @@ -79,10 +79,10 @@ class CScatterOpCUDAKernel : public framework::OpKernel { comm->comm(), stream)); - framework::TensorCopy(*static_cast(x), + framework::TensorCopy(*static_cast(x), place, *platform::DeviceContextPool::Instance().Get(place), - static_cast(&temp)); + static_cast(&temp)); } else { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out_ptr, numel, dtype, root_id, comm->comm(), stream)); @@ -94,9 +94,9 @@ class CScatterOpCUDAKernel : public framework::OpKernel { temp = temp.Slice(start_index, end_index); temp.Resize(out_dims); out->mutable_data(out_dims, place); - framework::TensorCopySync(*static_cast(&temp), + framework::TensorCopySync(*static_cast(&temp), place, - static_cast(out)); + static_cast(out)); out->Resize(out_dims); #else PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/collective/c_scatter_op.h b/paddle/fluid/operators/collective/c_scatter_op.h index ff59f91d32dc4..8e603d87456a9 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.h +++ b/paddle/fluid/operators/collective/c_scatter_op.h @@ -36,8 +36,8 @@ class CScatterOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_GLOO) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); auto root_id = ctx.Attr("root"); auto gloo = paddle::framework::GlooWrapper::GetInstance(); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu 
b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index ef7e298aaf6a3..455dcd6d7f9fd 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -24,7 +24,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaxinumNumBlocks = 4096; @@ -97,10 +97,10 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { template struct CSoftmaxWithCrossEntropyFunctor { void operator()(const framework::ExecutionContext& ctx) { - const Tensor* logits = ctx.Input("Logits"); - const Tensor* labels = ctx.Input("Label"); - Tensor* softmax = ctx.Output("Softmax"); - Tensor* loss = ctx.Output("Loss"); + const phi::DenseTensor* logits = ctx.Input("Logits"); + const phi::DenseTensor* labels = ctx.Input("Label"); + phi::DenseTensor* softmax = ctx.Output("Softmax"); + phi::DenseTensor* loss = ctx.Output("Loss"); const int rid = ctx.Attr("ring_id"); const int nranks = ctx.Attr("nranks"); @@ -250,10 +250,10 @@ struct CSoftmaxWithCrossEntropyFunctor { template struct CSoftmaxWithCrossEntropyProcessGroupFunctor { void operator()(const framework::ExecutionContext& ctx) { - const Tensor* logits = ctx.Input("Logits"); - const Tensor* labels = ctx.Input("Label"); - Tensor* softmax = ctx.Output("Softmax"); - Tensor* loss = ctx.Output("Loss"); + const phi::DenseTensor* logits = ctx.Input("Logits"); + const phi::DenseTensor* labels = ctx.Input("Label"); + phi::DenseTensor* softmax = ctx.Output("Softmax"); + phi::DenseTensor* loss = ctx.Output("Loss"); const int rid = ctx.Attr("ring_id"); const int nranks = ctx.Attr("nranks"); @@ -384,12 +384,13 @@ template class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - const Tensor* labels = context.Input("Label"); - const Tensor* loss_grad = - context.Input(framework::GradVarName("Loss")); - Tensor* logit_grad = - context.Output(framework::GradVarName("Logits")); - const Tensor* softmax = context.Input("Softmax"); + const phi::DenseTensor* labels = context.Input("Label"); + const phi::DenseTensor* loss_grad = + context.Input(framework::GradVarName("Loss")); + phi::DenseTensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + const phi::DenseTensor* softmax = + context.Input("Softmax"); const int rank = context.Attr("rank"); auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 5b34e4ba9d594..2089c23fa6ec5 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -56,8 +56,8 @@ template class CSplitOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); int nranks = ctx.Attr("nranks"); int rank = ctx.Attr("rank"); diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 3d7ab09f45e7d..89714eb8d2744 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -48,7 
+48,7 @@ struct GlobalGatherFunctor { const int64_t* cpu_global_count_data; auto local_count_len = 0; - framework::Tensor cpu_local_count; + phi::DenseTensor cpu_local_count; if (platform::is_cpu_place(local_count->place())) { cpu_local_count_data = local_count->data(); local_count_len = local_count->numel(); @@ -59,7 +59,7 @@ struct GlobalGatherFunctor { local_count_len = cpu_local_count.numel(); } - framework::Tensor cpu_global_count; + phi::DenseTensor cpu_global_count; if (platform::is_cpu_place(global_count->place())) { cpu_global_count_data = global_count->data(); } else { @@ -169,7 +169,7 @@ struct GlobalGatherProcessGroupFunctor { const int64_t* cpu_global_count_data; auto local_count_len = 0; - framework::Tensor cpu_local_count; + phi::DenseTensor cpu_local_count; if (platform::is_cpu_place(local_count->place())) { cpu_local_count_data = local_count->data(); local_count_len = local_count->numel(); @@ -180,7 +180,7 @@ struct GlobalGatherProcessGroupFunctor { local_count_len = cpu_local_count.numel(); } - framework::Tensor cpu_global_count; + phi::DenseTensor cpu_global_count; if (platform::is_cpu_place(global_count->place())) { cpu_global_count_data = global_count->data(); } else { diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 1337901f185af..d53afb919ccf0 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -46,7 +46,7 @@ struct GlobalScatterFunctor { auto out = ctx.Output("Out"); const int64_t* cpu_local_count_data; const int64_t* cpu_global_count_data; - framework::Tensor cpu_local_count; + phi::DenseTensor cpu_local_count; if (platform::is_cpu_place(local_count->place())) { cpu_local_count_data = local_count->data(); } else { @@ -55,7 +55,7 @@ struct GlobalScatterFunctor { cpu_local_count_data = cpu_local_count.data(); } auto global_count_len = 0; - framework::Tensor cpu_global_count; + phi::DenseTensor cpu_global_count; if (platform::is_cpu_place(global_count->place())) { cpu_global_count_data = global_count->data(); global_count_len = global_count->numel(); @@ -167,7 +167,7 @@ struct GlobalScatterProcessGroupFunctor { auto out = ctx.Output("Out"); const int64_t* cpu_local_count_data; const int64_t* cpu_global_count_data; - framework::Tensor cpu_local_count; + phi::DenseTensor cpu_local_count; if (platform::is_cpu_place(local_count->place())) { cpu_local_count_data = local_count->data(); } else { @@ -176,7 +176,7 @@ struct GlobalScatterProcessGroupFunctor { cpu_local_count_data = cpu_local_count.data(); } auto global_count_len = 0; - framework::Tensor cpu_global_count; + phi::DenseTensor cpu_global_count; if (platform::is_cpu_place(global_count->place())) { cpu_global_count_data = global_count->data(); global_count_len = global_count->numel(); diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 6bc18254737d3..eeda5c72d9cae 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -28,8 +28,8 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); int64_t numel = 
in->numel(); ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); diff --git a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc index 6b573c94535f7..d032839b83323 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc @@ -26,8 +26,8 @@ class CallPartialGatherOpASCENDKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); int64_t numel = in->numel(); HcclDataType dtype = platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype())); diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index ec18a172e1f8b..f68c2caf32bcb 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -46,7 +46,7 @@ framework::DDim recv_shape_info(const platform::Place &place, platform::ToNCCLDataType(framework::TransToProtoVarType(shape_dytpe)); // step1: recv the shape size - framework::Tensor gpu_shape_size_tensor(shape_dytpe); + phi::DenseTensor gpu_shape_size_tensor(shape_dytpe); if (!group) { gpu_shape_size_tensor.Resize({1}); gpu_shape_size_tensor.mutable_data(place, shape_dytpe); @@ -56,11 +56,11 @@ framework::DDim recv_shape_info(const platform::Place &place, } // copy the shape size tensor to cpu - framework::Tensor *cpu_shape_size_tensor = new framework::Tensor(shape_dytpe); + phi::DenseTensor *cpu_shape_size_tensor = new phi::DenseTensor(shape_dytpe); cpu_shape_size_tensor->Resize({1}); cpu_shape_size_tensor->mutable_data(platform::CPUPlace(), shape_dytpe); if (group) { - std::vector shape_size_tensor; + std::vector shape_size_tensor; shape_size_tensor.emplace_back(*cpu_shape_size_tensor); auto shape_size_task = group->Recv(shape_size_tensor, peer); } else { @@ -72,7 +72,7 @@ framework::DDim recv_shape_info(const platform::Place &place, VLOG(3) << "recv the shape size: " << shape_size << " from peer"; // step2: recv the shape - framework::Tensor gpu_shape_tensor(shape_dytpe); + phi::DenseTensor gpu_shape_tensor(shape_dytpe); if (!group) { gpu_shape_tensor.Resize({shape_size}); gpu_shape_tensor.mutable_data(place, shape_dytpe); @@ -82,11 +82,11 @@ framework::DDim recv_shape_info(const platform::Place &place, } // copy the shape tensor to cpu - framework::Tensor *cpu_shape_tensor = new framework::Tensor(shape_dytpe); + phi::DenseTensor *cpu_shape_tensor = new phi::DenseTensor(shape_dytpe); cpu_shape_tensor->Resize({shape_size}); cpu_shape_tensor->mutable_data(platform::CPUPlace(), shape_dytpe); if (group) { - std::vector shape_tensor; + std::vector shape_tensor; shape_tensor.emplace_back(*cpu_shape_tensor); auto shape_task = group->Recv(shape_tensor, peer); } else { diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 37b18703031de..9f63403dc43b2 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -26,7 +26,7 @@ namespace operators { #if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ NCCL_VERSION_CODE >= 2703 -void send_shape_info(const framework::Tensor& x, +void send_shape_info(const phi::DenseTensor& 
x, const platform::Place& place, const gpuStream_t& stream, platform::NCCLComm* comm, @@ -47,20 +47,19 @@ void send_shape_info(const framework::Tensor& x, int shape_size = dims.size(); // step1: send the shape size - framework::Tensor cpu_shape_size_tensor(shape_dytpe); + phi::DenseTensor cpu_shape_size_tensor(shape_dytpe); cpu_shape_size_tensor.Resize({1}); cpu_shape_size_tensor.mutable_data(platform::CPUPlace(), shape_dytpe); auto* cpu_data = cpu_shape_size_tensor.data(); cpu_data[0] = shape_size; if (group) { - std::vector shape_size_tensor; + std::vector shape_size_tensor; shape_size_tensor.template emplace_back(cpu_shape_size_tensor); auto shape_size_task = group->Send(shape_size_tensor, peer); } else { // copy the shape size tensor to gpu and send - framework::Tensor* gpu_shape_size_tensor = - new framework::Tensor(shape_dytpe); + phi::DenseTensor* gpu_shape_size_tensor = new phi::DenseTensor(shape_dytpe); gpu_shape_size_tensor->Resize({1}); gpu_shape_size_tensor->mutable_data(place, shape_dytpe); framework::TensorCopySync( @@ -76,7 +75,7 @@ void send_shape_info(const framework::Tensor& x, VLOG(3) << "send the shape size: " << shape_size << " to peer"; // step2: send the shape - framework::Tensor cpu_shape_tensor(shape_dytpe); + phi::DenseTensor cpu_shape_tensor(shape_dytpe); cpu_shape_tensor.Resize({shape_size}); cpu_shape_tensor.mutable_data(platform::CPUPlace(), shape_dytpe); auto* cpu_shape_data = cpu_shape_tensor.data(); @@ -85,12 +84,12 @@ void send_shape_info(const framework::Tensor& x, } if (group) { - std::vector shape_tensor; + std::vector shape_tensor; shape_tensor.template emplace_back(cpu_shape_tensor); auto shape_task = group->Send(shape_tensor, peer); } else { // copy the shape tensor to gpu and send - framework::Tensor* gpu_shape_tensor = new framework::Tensor(shape_dytpe); + phi::DenseTensor* gpu_shape_tensor = new phi::DenseTensor(shape_dytpe); gpu_shape_tensor->Resize({shape_size}); gpu_shape_tensor->mutable_data(place, shape_dytpe); framework::TensorCopySync(cpu_shape_tensor, place, gpu_shape_tensor); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 7c3a8103e1dbb..a875f1fc8df9e 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -30,7 +30,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class ConcatOp : public framework::OperatorWithKernel { public: @@ -39,7 +39,7 @@ class ConcatOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto inputs = ctx.MultiInput("X"); + auto inputs = ctx.MultiInput("X"); auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto *input : inputs) { @@ -66,7 +66,7 @@ class ConcatOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "AxisTensor") { return expected_kernel_type; @@ -145,7 +145,7 @@ class ConcatOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "AxisTensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/concat_op_mlu.cc b/paddle/fluid/operators/concat_op_mlu.cc index a4cc1c37db0cf..38e87ad45bf27 100644 --- a/paddle/fluid/operators/concat_op_mlu.cc +++ b/paddle/fluid/operators/concat_op_mlu.cc @@ -31,7 +31,7 @@ class ConcatMLUKernel : public framework::OpKernel { auto ins_size = ins.size(); bool need_resize_out_dims = false; if (ctx.HasInput("AxisTensor")) { - auto* axis_tensor = ctx.Input("AxisTensor"); + auto* axis_tensor = ctx.Input("AxisTensor"); axis = GetDataFromTensor(axis_tensor)[0]; need_resize_out_dims = true; } @@ -84,8 +84,7 @@ template class ConcatGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto ins = ctx.MultiInput("X"); auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); auto outs = @@ -98,7 +97,7 @@ class ConcatGradMLUKernel : public framework::OpKernel { "The first input tensor is not initalized.")); if (ctx.HasInput("AxisTensor")) { - auto* axis_tensor = ctx.Input("AxisTensor"); + auto* axis_tensor = ctx.Input("AxisTensor"); axis = GetDataFromTensor(axis_tensor)[0]; } diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc index 3dc0d28bd452d..6c2c48292adbd 100644 --- a/paddle/fluid/operators/concat_op_npu.cc +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -39,7 +39,7 @@ class ConcatNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); out->mutable_data(place); - std::vector inputs; + std::vector inputs; std::vector names; for (size_t i = 0; i < ins.size(); ++i) { if (ins[i] && ins[i]->numel() > 0) { @@ -66,8 +66,7 @@ template class ConcatGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto ins = ctx.MultiInput("X"); auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); auto outs = diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index c1ed46867f1ac..ec1ad1475d644 100644 --- 
a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -30,7 +30,7 @@ static void DataCopy(const framework::LoDTensor &src_item, #ifdef PADDLE_WITH_MKLDNN // Conversion from MKL-DNN to Paddle if (src_item.layout() == framework::DataLayout::kMKLDNN) { - framework::Tensor out; + phi::DenseTensor out; // Convert to desired Paddle layout, apart from grads of filter // as params are not a subject to paddle's data_format VLOG(4) << "innerTransDataLayoutFromMKLDNN"; diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index 64489c294d123..8478db44853b8 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -38,7 +38,7 @@ static void DeepCopy(const framework::LoDTensor &src_item, #ifdef PADDLE_WITH_MKLDNN // Conversion from MKL-DNN to Paddle if (src_item.layout() == framework::DataLayout::kMKLDNN) { - framework::Tensor out; + phi::DenseTensor out; // Convert to desired Paddle layout, apart from grads of filter // as params are not a subject to paddle's data_format framework::innerTransDataLayoutFromMKLDNN( @@ -75,7 +75,7 @@ class FetchV2Op : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (!tensor.IsInitialized()) { return expected_kernel_type; diff --git a/paddle/fluid/operators/controlflow/logical_op_mlu.cc b/paddle/fluid/operators/controlflow/logical_op_mlu.cc index 8eb30607158ec..5e1630447b9de 100644 --- a/paddle/fluid/operators/controlflow/logical_op_mlu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_mlu.cc @@ -18,15 +18,15 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class LogicalMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index c3d7df8d02743..7c2c11bbfb40e 100644 --- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -15,14 +15,14 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class LogicalNotNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -39,9 +39,9 @@ template class LogicalOrNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -58,9 +58,9 @@ template class LogicalAndPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/controlflow/logical_op_xpu.h b/paddle/fluid/operators/controlflow/logical_op_xpu.h index 445d853364ddd..5e1a24116b080 100644 --- a/paddle/fluid/operators/controlflow/logical_op_xpu.h +++ b/paddle/fluid/operators/controlflow/logical_op_xpu.h @@ -42,16 +42,16 @@ template class BinaryLogicalOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* out = context.Output("Out"); bool* out_ptr = out->mutable_data(context.GetPlace()); const T* x_ptr = x->data(); const T* y_ptr = y->data(); auto& dev_ctx = context.template device_context(); - framework::Tensor broadcast_x; - framework::Tensor broadcast_y; + phi::DenseTensor broadcast_x; + phi::DenseTensor broadcast_y; bool need_broad_cast = false; if (x->numel() != out->numel()) { // x need broadcast @@ -160,8 +160,8 @@ template class UnaryLogicalOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); if (x->numel() == 0) { return; } diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h index 285dc8fddb7f3..705fc1f5618b5 100644 --- a/paddle/fluid/operators/conv_base_helper.h +++ b/paddle/fluid/operators/conv_base_helper.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = platform::DataLayout; using framework::AlgorithmsCache; using framework::ConvSearchCache; @@ -68,7 +68,7 @@ struct ConvArgsBase { platform::TensorDescriptor idesc, odesc; platform::FilterDescriptor wdesc; platform::ConvolutionDescriptor cdesc; - const framework::Tensor *x, *w, *o; + const phi::DenseTensor *x, *w, *o; DataT cudnn_dtype; // strides @@ -84,9 +84,9 @@ struct ConvArgsBase { // data foramt DataLayout data_layout; - ConvArgsBase(const framework::Tensor* x, - const framework::Tensor* w, - const framework::Tensor* o, + ConvArgsBase(const phi::DenseTensor* x, + const phi::DenseTensor* w, + const phi::DenseTensor* o, const std::vector s, const std::vector p, const std::vector d, diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index ba4e5585f363f..8795b3fa14bcc 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -29,8 +29,8 @@ using ConvArgs = ConvArgsBase; template static void RemovePaddingSlice(const phi::GPUContext& context, - const Tensor* input, - Tensor* out, + const phi::DenseTensor* input, + phi::DenseTensor* out, const std::vector& starts, const std::vector& axes) { auto& place = *context.eigen_device(); diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 648116647b04a..907ae50941602 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -24,8 +24,8 @@ using ConvArgs = ConvArgsBase; template static void RemovePaddingSlice(const phi::GPUContext& context, - const Tensor* input, - Tensor* out, + const phi::DenseTensor* input, + phi::DenseTensor* out, const std::vector& starts, const std::vector& axes) { auto& place = *context.eigen_device(); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index d9c1332191ac7..f61329107125a 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -222,8 +222,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( if (input_data_type != framework::proto::VarType::INT8 && input_data_type != framework::proto::VarType::UINT8 && input_data_type != framework::proto::VarType::BF16) { - auto filter_data_type = - framework::TransToProtoVarType(ctx.Input("Filter")->dtype()); + auto filter_data_type = framework::TransToProtoVarType( + ctx.Input("Filter")->dtype()); PADDLE_ENFORCE_EQ( input_data_type, filter_data_type, @@ -260,7 +260,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( framework::OpKernelType ConvOp::GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN // Only input require reshaping, weights and @@ -532,7 +532,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( framework::OpKernelType ConvOpGrad::GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN // Only input require reshaping, weights and diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 806265376fa1f..925603dad9ba3 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -29,7 +29,7 @@ limitations under 
the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; constexpr int kConvMKLDNNFP32 = 1; constexpr int kConvMKLDNNINT8 = 2; constexpr int kConvMKLDNNINT8WS8 = 3; @@ -207,7 +207,7 @@ class ConvOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; @@ -222,7 +222,7 @@ class ConvOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; diff --git a/paddle/fluid/operators/conv_op_mlu.cc b/paddle/fluid/operators/conv_op_mlu.cc index 0e0ed82e8798a..cd0bd90637e34 100644 --- a/paddle/fluid/operators/conv_op_mlu.cc +++ b/paddle/fluid/operators/conv_op_mlu.cc @@ -18,16 +18,16 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = framework::DataLayout; template class MLUConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); + const phi::DenseTensor* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -129,11 +129,14 @@ template class MLUConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = + ctx.Input(framework::GradVarName("Output")); + auto input_grad = + ctx.Output(framework::GradVarName("Input")); + auto filter_grad = + ctx.Output(framework::GradVarName("Filter")); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -292,9 +295,9 @@ template class MLUDepthwiseConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); + const phi::DenseTensor* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -398,11 +401,14 @@ template class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + auto input = 
ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = + ctx.Input(framework::GradVarName("Output")); + auto input_grad = + ctx.Output(framework::GradVarName("Input")); + auto filter_grad = + ctx.Output(framework::GradVarName("Filter")); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index dad2e7d238bfc..f4c7de95483b5 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -18,12 +18,12 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; static void CastToFP16(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { out->mutable_data(ctx.GetPlace()); NpuOpRunner runner; runner.SetType("Cast") @@ -35,8 +35,8 @@ static void CastToFP16(const framework::ExecutionContext& ctx, static void CastToFP32(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { out->mutable_data(ctx.GetPlace()); NpuOpRunner runner; runner.SetType("Cast") @@ -50,9 +50,9 @@ template class DepthwiseConvNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor* filter = ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); + const phi::DenseTensor* input = ctx.Input("Input"); + const phi::DenseTensor* filter = ctx.Input("Filter"); + phi::DenseTensor* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); const std::vector stride = ctx.Attr>("strides"); @@ -151,11 +151,14 @@ template class DepthwiseConvGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor* filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + const phi::DenseTensor* input = ctx.Input("Input"); + const phi::DenseTensor* filter = ctx.Input("Filter"); + auto output_grad = + ctx.Input(framework::GradVarName("Output")); + auto input_grad = + ctx.Output(framework::GradVarName("Input")); + auto filter_grad = + ctx.Output(framework::GradVarName("Filter")); const std::vector stride = ctx.Attr>("strides"); std::vector padding = ctx.Attr>("paddings"); @@ -268,9 +271,9 @@ template class NPUConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); + const phi::DenseTensor* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -336,11 +339,14 @@ template class NPUConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("Input"); - auto filter = 
ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = + ctx.Input(framework::GradVarName("Output")); + auto input_grad = + ctx.Output(framework::GradVarName("Input")); + auto filter_grad = + ctx.Output(framework::GradVarName("Filter")); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -447,9 +453,9 @@ template class NPUConv3dKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor* filter = ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); + const phi::DenseTensor* input = ctx.Input("Input"); + const phi::DenseTensor* filter = ctx.Input("Filter"); + phi::DenseTensor* output = ctx.Output("Output"); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -533,12 +539,14 @@ template class NPUConv3dGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor* filter = ctx.Input("Filter"); - const Tensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); - Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + const phi::DenseTensor* input = ctx.Input("Input"); + const phi::DenseTensor* filter = ctx.Input("Filter"); + const phi::DenseTensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + phi::DenseTensor* input_grad = + ctx.Output(framework::GradVarName("Input")); + phi::DenseTensor* filter_grad = + ctx.Output(framework::GradVarName("Filter")); const std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc index d7dfa88e2d277..c6b33998eb61b 100644 --- a/paddle/fluid/operators/conv_shift_op.cc +++ b/paddle/fluid/operators/conv_shift_op.cc @@ -21,7 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; template @@ -156,9 +155,9 @@ template class ConvShiftKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *X = context.Input("X"); - auto *Y = context.Input("Y"); - auto *Out = context.Output("Out"); + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *Out = context.Output("Out"); Out->mutable_data(context.GetPlace()); auto x = EigenMatrix::From(*X); @@ -187,11 +186,11 @@ class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *X = context.Input("X"); - auto *Y = context.Input("Y"); - auto *dOut = context.Input(framework::GradVarName("Out")); - auto *dX = context.Output(framework::GradVarName("X")); - auto *dY = context.Output(framework::GradVarName("Y")); + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *dOut = context.Input(framework::GradVarName("Out")); + auto *dX = context.Output(framework::GradVarName("X")); + auto *dY = context.Output(framework::GradVarName("Y")); auto x = EigenMatrix::From(*X); auto y = EigenMatrix::From(*Y); diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 89b703d8d1a5d..689722d24eccb 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - namespace { inline int DivUp(int x, int y) { return (x + y - 1) / y; } @@ -127,9 +125,9 @@ template class ConvShiftKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Y = context.Input("Y"); - Tensor *Out = context.Output("Out"); + const phi::DenseTensor *X = context.Input("X"); + const phi::DenseTensor *Y = context.Input("Y"); + phi::DenseTensor *Out = context.Output("Out"); const T *x_data = X->data(); const T *y_data = Y->data(); T *out_data = Out->mutable_data(context.GetPlace()); @@ -156,15 +154,18 @@ template class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Y = context.Input("Y"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); + const phi::DenseTensor *X = context.Input("X"); + const phi::DenseTensor *Y = context.Input("Y"); + const phi::DenseTensor *dOut = + context.Input(framework::GradVarName("Out")); const T *x_data = X->data(); const T *y_data = Y->data(); const T *dout_data = dOut->data(); - Tensor *dX = context.Output(framework::GradVarName("X")); - Tensor *dY = context.Output(framework::GradVarName("Y")); + phi::DenseTensor *dX = + context.Output(framework::GradVarName("X")); + phi::DenseTensor *dY = + context.Output(framework::GradVarName("Y")); int batch_size = X->dims()[0]; int x_width = X->dims()[1]; diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index d883d2da291b2..8c221ec542114 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -64,7 +64,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( framework::OpKernelType ConvTransposeOp::GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& 
tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN // Only input require reshaping, weights and diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index ca82ca518a9e7..d47828e5bdc8f 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -43,7 +43,7 @@ class ConvTransposeOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; diff --git a/paddle/fluid/operators/conv_transpose_op_mlu.cc b/paddle/fluid/operators/conv_transpose_op_mlu.cc index f757898886e1f..9adeec2d7079e 100644 --- a/paddle/fluid/operators/conv_transpose_op_mlu.cc +++ b/paddle/fluid/operators/conv_transpose_op_mlu.cc @@ -20,16 +20,16 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = framework::DataLayout; template class Conv2DTransposeMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor* filter = ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); + const phi::DenseTensor* input = ctx.Input("Input"); + const phi::DenseTensor* filter = ctx.Input("Filter"); + phi::DenseTensor* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); std::vector output_padding = ctx.Attr>("output_padding"); @@ -131,12 +131,14 @@ template class Conv2DTransposeGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor* filter = ctx.Input("Filter"); - const Tensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); - Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + const phi::DenseTensor* input = ctx.Input("Input"); + const phi::DenseTensor* filter = ctx.Input("Filter"); + const phi::DenseTensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + phi::DenseTensor* input_grad = + ctx.Output(framework::GradVarName("Input")); + phi::DenseTensor* filter_grad = + ctx.Output(framework::GradVarName("Filter")); if ((!input_grad) && (!filter_grad)) return; diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc index 94a6825ff6134..66a49b1bb89b2 100644 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ b/paddle/fluid/operators/conv_transpose_op_npu.cc @@ -20,16 +20,16 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template class Conv2DTransposeNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor* filter = ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); + const phi::DenseTensor* input = ctx.Input("Input"); + const phi::DenseTensor* filter = ctx.Input("Filter"); + phi::DenseTensor* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); std::vector output_padding = ctx.Attr>("output_padding"); @@ -107,12 +107,14 @@ template class Conv2DTransposeGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor* filter = ctx.Input("Filter"); - const Tensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); - Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + const phi::DenseTensor* input = ctx.Input("Input"); + const phi::DenseTensor* filter = ctx.Input("Filter"); + const phi::DenseTensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + phi::DenseTensor* input_grad = + ctx.Output(framework::GradVarName("Input")); + phi::DenseTensor* filter_grad = + ctx.Output(framework::GradVarName("Filter")); if ((!input_grad) && (!filter_grad)) return; @@ -203,9 +205,9 @@ template class Conv3DTransposeNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor* filter = ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); + const phi::DenseTensor* input = ctx.Input("Input"); + const phi::DenseTensor* filter = ctx.Input("Filter"); + phi::DenseTensor* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); std::vector output_padding = ctx.Attr>("output_padding"); diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc index a826f1d1b897d..70a125af15156 100644 --- a/paddle/fluid/operators/copy_cross_scope_op.cc +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -31,7 +31,7 @@ class OpBase; } // namespace paddle using LoDTensor = paddle::framework::LoDTensor; -using Tensor = paddle::framework::Tensor; +using Tensor = phi::DenseTensor; namespace paddle { namespace operators { @@ -66,7 +66,7 @@ class CopyCrossScopeOp : public framework::OperatorBase { platform::errors::NotFound("No variable with name %s found.", id_name)); auto id_tensor = id_var->GetMutable(); auto it = scope.kids().begin(); - framework::Tensor cpu_id_tensor; + phi::DenseTensor cpu_id_tensor; paddle::framework::TensorCopySync( *id_tensor, platform::CPUPlace(), &cpu_id_tensor); auto id_value = cpu_id_tensor.data(); diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc index 0e89889f40f29..cbd06ec042c48 100644 --- a/paddle/fluid/operators/correlation_op.cc +++ b/paddle/fluid/operators/correlation_op.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; inline std::vector CorrelationOutputSize(int batch, int input_height, @@ -115,17 +115,17 @@ class CorrelationOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Input1"); - PADDLE_ENFORCE_EQ( - input_data_type, - framework::TransToProtoVarType(ctx.Input("Input2")->dtype()), - platform::errors::InvalidArgument( - "X and Y shoule have the same datatype")); + PADDLE_ENFORCE_EQ(input_data_type, + framework::TransToProtoVarType( + ctx.Input("Input2")->dtype()), + platform::errors::InvalidArgument( + "X and Y shoule have the same datatype")); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return framework::OpKernelType( expected_kernel_type.data_type_, tensor.place(), tensor.layout()); diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index 434506c033c4d..0155463dd5306 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -31,8 +31,6 @@ namespace operators { #endif #define FULL_MASK 0xffffffff -using framework::Tensor; - template __forceinline__ __device__ T warpReduceSum(T val) { for (int offset = 16; offset > 0; offset /= 2) { @@ -186,8 +184,8 @@ class CorrelationCUDAKernel : public framework::OpKernel { platform::errors::InvalidArgument( "Correlation only supports GPU now.")); - auto *input1 = ctx.Input("Input1"); - auto *input2 = ctx.Input("Input2"); + auto *input1 = ctx.Input("Input1"); + auto *input2 = ctx.Input("Input2"); int pad_size = ctx.Attr("pad_size"); int kernel_size = ctx.Attr("kernel_size"); int stride1 = ctx.Attr("stride1"); @@ -195,7 +193,7 @@ class CorrelationCUDAKernel : public framework::OpKernel { int max_displacement = ctx.Attr("max_displacement"); int corr_type_multiply = ctx.Attr("corr_type_multiply"); - auto *output = ctx.Output("Output"); + auto *output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -209,11 +207,11 @@ class CorrelationCUDAKernel : public framework::OpKernel { int padded_input_height = H + 2 * pad_size; int padded_input_width = W + 2 * pad_size; - Tensor rinput1 = ctx.AllocateTmpTensor( + phi::DenseTensor rinput1 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput1.mutable_data(ctx.GetPlace()); - Tensor rinput2 = ctx.AllocateTmpTensor( + phi::DenseTensor rinput2 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput2.mutable_data(ctx.GetPlace()); @@ -453,10 +451,10 @@ class CorrelationCUDAGradKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "Correlation only supports GPU now.")); - const auto *input1 = ctx.Input("Input1"); - const auto *input2 = ctx.Input("Input2"); + const auto *input1 = ctx.Input("Input1"); + const auto *input2 = ctx.Input("Input2"); const auto *grad_output = - ctx.Input(framework::GradVarName("Output")); + ctx.Input(framework::GradVarName("Output")); const int pad_size = ctx.Attr("pad_size"); const int kernel_size = ctx.Attr("kernel_size"); const int stride1 = ctx.Attr("stride1"); @@ -464,9 +462,11 @@ class 
CorrelationCUDAGradKernel : public framework::OpKernel { const int max_displacement = ctx.Attr("max_displacement"); const int corr_type_multiply = ctx.Attr("corr_type_multiply"); - auto *grad_input1 = ctx.Output(framework::GradVarName("Input1")); + auto *grad_input1 = + ctx.Output(framework::GradVarName("Input1")); grad_input1->mutable_data(ctx.GetPlace()); - auto *grad_input2 = ctx.Output(framework::GradVarName("Input2")); + auto *grad_input2 = + ctx.Output(framework::GradVarName("Input2")); grad_input2->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -479,11 +479,11 @@ class CorrelationCUDAGradKernel : public framework::OpKernel { int padded_input_height = H + 2 * pad_size; int padded_input_width = W + 2 * pad_size; - Tensor rinput1 = ctx.AllocateTmpTensor( + phi::DenseTensor rinput1 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput1.mutable_data(ctx.GetPlace()); - Tensor rinput2 = ctx.AllocateTmpTensor( + phi::DenseTensor rinput2 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput2.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index e3228104de38b..902a5eda2d2f6 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class CosSimOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h index 7eb62453840a9..b9db1bcb5df22 100644 --- a/paddle/fluid/operators/cos_sim_op.h +++ b/paddle/fluid/operators/cos_sim_op.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class CosSimKernel : public framework::OpKernel { @@ -29,10 +29,10 @@ class CosSimKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { // get Tensor auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); + auto* in_y = context.Input("Y"); auto* out_z = context.Output("Out"); - auto* out_x_norm = context.Output("XNorm"); - auto* out_y_norm = context.Output("YNorm"); + auto* out_x_norm = context.Output("XNorm"); + auto* out_y_norm = context.Output("YNorm"); int rows_x = in_x->dims()[0]; int rows_y = in_y->dims()[0]; @@ -75,14 +75,17 @@ class CosSimGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { // get Tensor - auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); - auto* in_z = context.Input("Out"); - auto* in_x_norm = context.Input("XNorm"); - auto* in_y_norm = context.Input("YNorm"); - auto* out_grad_x = context.Output(framework::GradVarName("X")); - auto* out_grad_y = context.Output(framework::GradVarName("Y")); - auto* in_grad_z = context.Input(framework::GradVarName("Out")); + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* in_z = context.Input("Out"); + auto* in_x_norm = context.Input("XNorm"); + auto* in_y_norm = context.Input("YNorm"); + auto* out_grad_x = + context.Output(framework::GradVarName("X")); + auto* out_grad_y = + context.Output(framework::GradVarName("Y")); + auto* in_grad_z = + context.Input(framework::GradVarName("Out")); // compute gradident int rows_x = in_x->dims()[0]; diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index ce3844de6a7f0..3723c5c5dd3ea 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -25,16 +25,15 @@ namespace operators { using framework::LoD; using framework::LoDTensor; -using framework::Tensor; template class CRFDecodingOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* emission_weights = ctx.Input("Emission"); - auto* transition_weights = ctx.Input("Transition"); + auto* transition_weights = ctx.Input("Transition"); auto* label = ctx.Input("Label"); - auto* decoded_path = ctx.Output("ViterbiPath"); + auto* decoded_path = ctx.Output("ViterbiPath"); int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); phi::funcs::SetConstant()( @@ -42,12 +41,12 @@ class CRFDecodingOpKernel : public framework::OpKernel { bool has_length = ctx.HasInput("Length"); if (has_length) { - auto* length = ctx.Input("Length"); + auto* length = ctx.Input("Length"); const size_t seq_num = length->numel(); const int64_t* length_data = length->data(); auto in_dims = emission_weights->dims(); - Tensor emission_weights_tmp = *emission_weights; + phi::DenseTensor emission_weights_tmp = *emission_weights; emission_weights_tmp.Resize({in_dims[0] * in_dims[1], in_dims[2]}); decoded_path->Resize({in_dims[0] * in_dims[1], 1}); @@ -55,7 +54,8 @@ class CRFDecodingOpKernel : public framework::OpKernel { if (length_data[i] == 0) continue; int64_t start_pos = i * in_dims[1]; int64_t end_pos = start_pos + static_cast(length_data[i]); - Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); + phi::DenseTensor decoded_path_one_seq = + decoded_path->Slice(start_pos, end_pos); 
Decode(emission_weights_tmp.Slice(start_pos, end_pos), *transition_weights, &decoded_path_one_seq); @@ -97,7 +97,8 @@ class CRFDecodingOpKernel : public framework::OpKernel { if (lod[level][i] == lod[level][i + 1]) continue; int64_t start_pos = static_cast(lod[level][i]); int64_t end_pos = static_cast(lod[level][i + 1]); - Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); + phi::DenseTensor decoded_path_one_seq = + decoded_path->Slice(start_pos, end_pos); Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights, &decoded_path_one_seq); @@ -119,9 +120,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { } private: - void Decode(const Tensor& emission_weights, - const Tensor& transition_weights, - Tensor* decoded_path) const { + void Decode(const phi::DenseTensor& emission_weights, + const phi::DenseTensor& transition_weights, + phi::DenseTensor* decoded_path) const { auto emission_dims = emission_weights.dims(); const size_t seq_len = emission_dims[0]; const size_t tag_num = emission_dims[1]; @@ -132,9 +133,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { // alpha is a memo table. An element alpha(k, v) records the score of the // best sequence of tags from position 1 to position k with v being the end // tag. - Tensor alpha; + phi::DenseTensor alpha; T* alpha_value = alpha.mutable_data(emission_dims, platform::CPUPlace()); - Tensor track; + phi::DenseTensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); auto ker = diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index f7c72c11ddfac..462764230f484 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -21,8 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; - class CropOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index fe17ac773a259..c193eabba372c 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -29,18 +29,17 @@ template using EigenTensor = framework::EigenTensor; -using framework::Tensor; static std::vector GetOffsets(const framework::ExecutionContext& ctx) { std::vector res; - int rank = ctx.Input("X")->dims().size(); + int rank = ctx.Input("X")->dims().size(); if (ctx.HasInput("Offsets")) { PADDLE_ENFORCE_EQ(ctx.Attr>("offsets").empty(), true, platform::errors::InvalidArgument( "Input 'Offsets' and attribute 'offsets' " "should not be used at the same time for CropOp.")); - const auto* offsets_tensor = ctx.Input("Offsets"); + const auto* offsets_tensor = ctx.Input("Offsets"); PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1, platform::errors::InvalidArgument( @@ -57,7 +56,7 @@ static std::vector GetOffsets(const framework::ExecutionContext& ctx) { offsets_tensor->dims()[0], rank)); const int* offsets_data; - framework::Tensor cpu_tmp_tensor; + phi::DenseTensor cpu_tmp_tensor; if (platform::is_cpu_place(offsets_tensor->place())) { offsets_data = offsets_tensor->data(); } else { @@ -83,8 +82,8 @@ static std::vector GetOffsets(const framework::ExecutionContext& ctx) { template void CropFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); auto out_dims = out->dims(); if (out_dims[0] == -1) { out_dims[0] = x->dims()[0]; @@ -115,7 +114,7 @@ template class CropKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - int rank = context.Input("X")->dims().size(); + int rank = context.Input("X")->dims().size(); PADDLE_ENFORCE_GE( rank, 1, @@ -155,10 +154,11 @@ class CropKernel : public framework::OpKernel { template void CropGradFunction(const framework::ExecutionContext& context) { - auto* d_x = context.Output(framework::GradVarName("X")); - auto* x = context.Input("X"); + auto* d_x = context.Output(framework::GradVarName("X")); + auto* x = context.Input("X"); if (d_x != nullptr) { - auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_out = + context.Input(framework::GradVarName("Out")); d_x->mutable_data(x->dims(), context.GetPlace()); auto offsets = GetOffsets(context); Eigen::array, D> paddings; @@ -180,7 +180,9 @@ class CropGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { size_t rank = - context.Input(framework::GradVarName("Out"))->dims().size(); + context.Input(framework::GradVarName("Out")) + ->dims() + .size(); PADDLE_ENFORCE_GE( rank, 1, diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc index bd50dea15f80e..8980e5f73dee7 100644 --- a/paddle/fluid/operators/crop_op_npu.cc +++ b/paddle/fluid/operators/crop_op_npu.cc @@ -18,17 +18,17 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class CropNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); std::vector offset_list; if (ctx.HasInput("Offsets")) { - auto* offsets_tensor = ctx.Input("Offsets"); + auto* offsets_tensor = ctx.Input("Offsets"); paddle::framework::TensorToVector( *offsets_tensor, ctx.device_context(), &offset_list); if (offset_list.empty()) { @@ -56,11 +56,11 @@ class CropNPUKernel : public framework::OpKernel { int axis_int = 0; framework::NPUAttributeMap attr_input = {{"offsets", offset_list}, {"axis", axis_int}}; - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); if (ctx.HasInput("Y")) { - auto* shape = ctx.Input("Y"); + auto* shape = ctx.Input("Y"); PADDLE_ENFORCE_EQ(shape->dims().size(), x->dims().size(), platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index c75a5eaf86dac..44986baef8120 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class CropTensorOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -117,7 +115,7 @@ class CropTensorOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor" || var_name == "OffsetsTensor" || var_name == "Shape" || var_name == "Offsets") { @@ -276,7 +274,7 @@ class CropTensorOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor" || var_name == "OffsetsTensor" || var_name == "Shape" || var_name == "Offsets") { diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 4445e0a79a640..2949dc8d1fb2a 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -23,15 +23,15 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class CrossEntropyOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* y = ctx.Output("Y"); + auto* x = ctx.Input("X"); + auto* labels = ctx.Input("Label"); + auto* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); int rank = x->dims().size(); @@ -126,10 +126,10 @@ template class CrossEntropyGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* label = ctx.Input("Label"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* label = ctx.Input("Label"); + auto* dx = ctx.Output(framework::GradVarName("X")); T* dx_data = dx->mutable_data(ctx.GetPlace()); // Following computation only depends on the last dimension size. So it's @@ -244,10 +244,10 @@ template class CrossEntropyOpKernel2 : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - auto* y = ctx.Output("Y"); - auto* match_x = ctx.Output("MatchX"); + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); + auto* y = ctx.Output("Y"); + auto* match_x = ctx.Output("MatchX"); auto& x_dims = x->dims(); auto feature_size = x_dims[x_dims.size() - 1]; @@ -271,10 +271,10 @@ template class CrossEntropyGradientOpKernel2 : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* match_x = ctx.Input("MatchX"); - auto* label = ctx.Input("Label"); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* match_x = ctx.Input("MatchX"); + auto* label = ctx.Input("Label"); auto* p_dx = dx->mutable_data(ctx.GetPlace()); auto* p_dy = dy->data(); diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index 6eeb890d38f03..e6f2e9900b051 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -24,7 +24,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template diff --git a/paddle/fluid/operators/cuda_graph_with_in_out.h b/paddle/fluid/operators/cuda_graph_with_in_out.h index e5e98aa2445c1..a667c40234dca 100644 --- a/paddle/fluid/operators/cuda_graph_with_in_out.h +++ b/paddle/fluid/operators/cuda_graph_with_in_out.h @@ -28,7 +28,7 @@ class CUDAGraphWithInOuts { template CUDAGraphWithInOuts(Callable &&callable, platform::CUDAPlace place, - const std::vector &in_ptrs, + const std::vector &in_ptrs, cudaStreamCaptureMode mode, int64_t pool_id) { in_indices_.resize(in_ptrs.size()); @@ -61,7 +61,7 @@ class CUDAGraphWithInOuts { } } - void Run(const std::vector &ins) { + void Run(const std::vector &ins) { PADDLE_ENFORCE_EQ( ins.size(), in_indices_.size(), @@ -75,8 +75,8 @@ class CUDAGraphWithInOuts { graph_->Replay(); } - std::vector GetOutputs() { - std::vector outs(out_indices_.size()); + std::vector GetOutputs() { + std::vector outs(out_indices_.size()); for (size_t i = 0; i < out_indices_.size(); ++i) { if (out_indices_[i] >= 0) { outs[i] = &outs_[out_indices_[i]]; @@ -89,8 +89,8 @@ class CUDAGraphWithInOuts { private: std::unique_ptr graph_; - std::vector ins_; - std::vector outs_; + std::vector ins_; + std::vector outs_; std::vector in_indices_; std::vector out_indices_; }; @@ -103,17 +103,17 @@ static std::unique_ptr CaptureCUDAGraph( const std::vector &output_names, cudaStreamCaptureMode mode, int64_t pool_id) { - std::vector inputs; + std::vector inputs; for (const auto &name : input_names) { - auto input_tensors = ctx.MultiInput(name); + auto input_tensors = ctx.MultiInput(name); inputs.insert(inputs.end(), input_tensors.begin(), input_tensors.end()); } - auto func = [&](const std::vector &inputs) { + auto func = [&](const std::vector &inputs) { callable(ctx); - std::vector outputs; + std::vector outputs; for (const auto &name : output_names) { - auto output_tensors = ctx.MultiOutput(name); + auto output_tensors = ctx.MultiOutput(name); outputs.insert( outputs.end(), output_tensors.begin(), output_tensors.end()); } @@ -128,9 +128,9 @@ static void ExecuteCUDAGraph(const framework::ExecutionContext &ctx, const std::vector &input_names, const std::vector &output_names, CUDAGraphWithInOuts *graph) { - std::vector inputs; + std::vector inputs; for (const auto &name : input_names) { - auto input_tensors = ctx.MultiInput(name); + auto input_tensors = ctx.MultiInput(name); inputs.insert(inputs.end(), input_tensors.begin(), input_tensors.end()); } @@ -139,7 +139,7 @@ static void ExecuteCUDAGraph(const framework::ExecutionContext &ctx, size_t idx = 0; for (const auto &name : output_names) { - auto output_tensors = ctx.MultiOutput(name); + auto output_tensors = ctx.MultiOutput(name); for (auto *out_t : output_tensors) { if (outputs[idx] != nullptr) { *out_t = *outputs[idx]; diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index 317d78639fcf3..32f1b46dbbd39 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -52,7 +52,7 @@ class ScopedRNNBase { const std::vector& sequence_length, size_t* workspace_size, size_t* reserve_size, - framework::Tensor* dropout_state) { + phi::DenseTensor* dropout_state) { int numDirections = is_bidirec_ ? 
2 : 1; cudnnDataType_t cudnn_type = platform::CudnnDataType::type; diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index d53333d217603..3435f790a3651 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -27,7 +27,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template bool is_continuous(const Type &weight_list) { @@ -112,7 +112,7 @@ void LSTMInferece(const bool &has_seq_length, T *out_data, T *last_h_data, T *last_c_data, - framework::Tensor *workspace_data, + phi::DenseTensor *workspace_data, const size_t &workspace_size) { if (!has_seq_length) { // for inference @@ -205,15 +205,15 @@ template class CudnnLSTMGPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const Tensor *x = ctx.Input("Input"); - const Tensor *init_h = ctx.Input("InitH"); - const Tensor *init_c = ctx.Input("InitC"); + const Tensor *x = ctx.Input("Input"); + const Tensor *init_h = ctx.Input("InitH"); + const Tensor *init_c = ctx.Input("InitC"); - Tensor *out = ctx.Output("Out"); - Tensor *last_h = ctx.Output("LastH"); - Tensor *last_c = ctx.Output("LastC"); - Tensor *reserve = ctx.Output("Reserve"); - Tensor *state_out = ctx.Output("StateOut"); + Tensor *out = ctx.Output("Out"); + Tensor *last_h = ctx.Output("LastH"); + Tensor *last_c = ctx.Output("LastC"); + Tensor *reserve = ctx.Output("Reserve"); + Tensor *state_out = ctx.Output("StateOut"); const T *x_data = x->data(); const T *init_h_data = init_h->data(); @@ -243,7 +243,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { bool has_seq_length = ctx.HasInput("SequenceLength"); std::vector SequenceLength; if (has_seq_length) { - auto *sequence_length = ctx.Input("SequenceLength"); + auto *sequence_length = ctx.Input("SequenceLength"); SequenceLength = operators::GetDataFromTensor(sequence_length); } @@ -266,12 +266,12 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { reinterpret_cast(ctx.device_context()) .stream(); if (is_test && ctx.HasInput("W")) { - auto *W = ctx.Input("W"); + auto *W = ctx.Input("W"); w_initialized = W->IsInitialized() ? 
true : false; weight_numel = W->numel(); } if (!w_initialized) { - auto weight_list = ctx.MultiInput("WeightList"); + auto weight_list = ctx.MultiInput("WeightList"); bool continuous = is_continuous>(weight_list); weight_numel = size_sum(weight_list); @@ -301,7 +301,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { w_data = const_cast(weight_list[0]->data()); } } else { - auto *W = ctx.Input("W"); + auto *W = ctx.Input("W"); w_data = const_cast(W->data()); } @@ -322,7 +322,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { &reserve_size, state_out); - framework::Tensor workspace_data_; + phi::DenseTensor workspace_data_; workspace_data_.mutable_data( {static_cast(workspace_size)}, ctx.GetPlace()); @@ -442,23 +442,28 @@ template class CudnnLSTMGPUGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *input = ctx.Input("Input"); - auto *init_h = ctx.Input("InitH"); - auto *init_c = ctx.Input("InitC"); - auto *reserve = ctx.Input("Reserve"); - auto *state_out = ctx.Input("StateOut"); - auto weight_list = ctx.MultiInput("WeightList"); - - auto *out = ctx.Input("Out"); - auto *out_grad = ctx.Input(framework::GradVarName("Out")); - auto *last_h_grad = ctx.Input(framework::GradVarName("LastH")); - auto *last_c_grad = ctx.Input(framework::GradVarName("LastC")); - - auto *in_grad = ctx.Output(framework::GradVarName("Input")); - auto *init_h_grad = ctx.Output(framework::GradVarName("InitH")); - auto *init_c_grad = ctx.Output(framework::GradVarName("InitC")); - auto weight_grad_list = ctx.MultiOutput( - framework::GradVarName("WeightList")); + auto *input = ctx.Input("Input"); + auto *init_h = ctx.Input("InitH"); + auto *init_c = ctx.Input("InitC"); + auto *reserve = ctx.Input("Reserve"); + auto *state_out = ctx.Input("StateOut"); + auto weight_list = ctx.MultiInput("WeightList"); + + auto *out = ctx.Input("Out"); + auto *out_grad = ctx.Input(framework::GradVarName("Out")); + auto *last_h_grad = + ctx.Input(framework::GradVarName("LastH")); + auto *last_c_grad = + ctx.Input(framework::GradVarName("LastC")); + + auto *in_grad = + ctx.Output(framework::GradVarName("Input")); + auto *init_h_grad = + ctx.Output(framework::GradVarName("InitH")); + auto *init_c_grad = + ctx.Output(framework::GradVarName("InitC")); + auto weight_grad_list = + ctx.MultiOutput(framework::GradVarName("WeightList")); auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); @@ -528,7 +533,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { bool has_seq_length = ctx.HasInput("SequenceLength"); std::vector SequenceLength; if (has_seq_length) { - auto *sequence_length = ctx.Input("SequenceLength"); + auto *sequence_length = ctx.Input("SequenceLength"); SequenceLength = operators::GetDataFromTensor(sequence_length); } @@ -557,7 +562,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { &reserve_size, const_cast(state_out)); - framework::Tensor workspace_data_; + phi::DenseTensor workspace_data_; workspace_data_.mutable_data( {static_cast(workspace_size)}, ctx.GetPlace()); const uint8_t *reserve_data = reserve->data(); diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index 69448000ac39e..6cd7160e0ae26 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -53,7 +53,7 @@ struct CudnnRNNCache { cudnnFilterDescriptor_t dw_desc_; size_t workspace_size_; - framework::Tensor workspace_data_; + 
phi::DenseTensor workspace_data_; size_t seq_length_; @@ -78,7 +78,7 @@ struct CudnnRNNCache { int seed, int weight_numel, size_t *reserve_size_, - framework::Tensor *dropout_state_, + phi::DenseTensor *dropout_state_, bool initialized, cudnnDataType_t cudnn_type) { seq_length_ = seq_len; diff --git a/paddle/fluid/operators/cumsum_op_mlu.cc b/paddle/fluid/operators/cumsum_op_mlu.cc index bc14075cc23f6..83d9a10af1730 100644 --- a/paddle/fluid/operators/cumsum_op_mlu.cc +++ b/paddle/fluid/operators/cumsum_op_mlu.cc @@ -18,14 +18,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class CumSumMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); int axis = ctx.Attr("axis"); bool exclusive = ctx.Attr("exclusive"); bool reverse = ctx.Attr("reverse"); @@ -33,7 +33,7 @@ class CumSumMLUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); - Tensor* input_ptr = const_cast(x); + phi::DenseTensor* input_ptr = const_cast(x); Tensor flat_x(x->type()); if (flatten) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index 9d434d24e55a8..672a59cf22f59 100644 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -19,10 +19,10 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; -static void CumsumImp(const Tensor& input, - Tensor* output, +static void CumsumImp(const phi::DenseTensor& input, + phi::DenseTensor* output, const framework::NPUAttributeMap& attr_input, const framework::ExecutionContext& ctx) { auto stream = @@ -65,8 +65,8 @@ template class CumSumNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); int axis = ctx.Attr("axis"); bool exclusive = ctx.Attr("exclusive"); bool reverse = ctx.Attr("reverse"); diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc index d776ccfa4db35..153b181b4fd6a 100644 --- a/paddle/fluid/operators/cvm_op.cc +++ b/paddle/fluid/operators/cvm_op.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class CVMOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index d08d9e14ef06e..f8ab86ff54e36 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -22,7 +22,7 @@ namespace paddle { namespace operators { using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -131,7 +131,7 @@ class CVMGradCUDAKernel : public framework::OpKernel { auto* dx = context.Output(framework::GradVarName("X")); T* dx_data = dx->mutable_data(context.GetPlace()); - const Tensor* cvm = context.Input("CVM"); + const phi::DenseTensor* cvm = context.Input("CVM"); const T* cvm_data = cvm->data(); const auto* dOut = diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h index 3258737d29a6a..4206c8f458425 100644 --- a/paddle/fluid/operators/cvm_op.h +++ b/paddle/fluid/operators/cvm_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -105,7 +105,7 @@ class CVMGradOpKernel : public framework::OpKernel { auto* dx = context.Output(framework::GradVarName("X")); T* dx_data = dx->mutable_data(context.GetPlace()); - const Tensor* cvm = context.Input("CVM"); + const phi::DenseTensor* cvm = context.Input("CVM"); const T* cvm_data = cvm->data(); const auto* dOut = diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 4fc279e03a36f..ea6b034f0b481 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; @@ -289,7 +289,7 @@ class DataNormKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - const auto *x = ctx.Input("X"); + const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); PADDLE_ENFORCE_EQ( x_dims.size(), @@ -299,19 +299,19 @@ class DataNormKernel : public framework::OpKernel { const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] : x_dims[x_dims.size() - 1]); - auto *y = ctx.Output("Y"); - auto *mean_out = ctx.Output("Means"); - auto *scales = ctx.Output("Scales"); + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("Means"); + auto *scales = ctx.Output("Scales"); // alloc memory T *y_data = y->mutable_data(ctx.GetPlace()); ConstEigenVectorArrayMap b_size_arr( - ctx.Input("BatchSize")->data(), C); + ctx.Input("BatchSize")->data(), C); ConstEigenVectorArrayMap b_sum_arr( - ctx.Input("BatchSum")->data(), C); + ctx.Input("BatchSum")->data(), C); ConstEigenVectorArrayMap b_square_sum_arr( - ctx.Input("BatchSquareSum")->data(), C); + ctx.Input("BatchSquareSum")->data(), C); EigenVectorArrayMap means_arr(mean_out->mutable_data(ctx.GetPlace()), C); EigenVectorArrayMap scales_arr(scales->mutable_data(ctx.GetPlace()), @@ -360,8 +360,8 @@ class DataNormKernel : public framework::OpKernel { scales_arr; } else if (ctx.Attr("enable_scale_and_shift") && slot_dim <= 0) { - const auto *scale_w = ctx.Input("scale_w"); - const auto *bias = ctx.Input("bias"); + const auto *scale_w = ctx.Input("scale_w"); + const auto *bias = ctx.Input("bias"); ConstEigenVectorArrayMap scale_w_arr(scale_w->data(), C); ConstEigenVectorArrayMap bias_arr(bias->data(), C); @@ -377,8 +377,8 @@ class DataNormKernel : public framework::OpKernel { } else { const int item_size = x->numel() / N; - const auto *scale_w = ctx.Input("scale_w"); - const auto *bias = ctx.Input("bias"); + const auto *scale_w = ctx.Input("scale_w"); + const auto *bias = ctx.Input("bias"); const T *scale_w_data = scale_w->data(); const T *bias_data = bias->data(); // location of show number in one embedding @@ -528,10 +528,10 @@ template class DataNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scales = ctx.Input("Scales"); - const auto *means = ctx.Input("Means"); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scales = ctx.Input("Scales"); + const auto *means = ctx.Input("Means"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = @@ -551,14 +551,15 @@ class DataNormGradKernel : public framework::OpKernel { // init output Tensor *d_x = nullptr; if (ctx.HasOutput(framework::GradVarName("X"))) { - d_x = ctx.Output(framework::GradVarName("X")); + d_x = ctx.Output(framework::GradVarName("X")); } auto *d_batch_size = - ctx.Output(framework::GradVarName("BatchSize")); - auto *d_batch_sum = ctx.Output(framework::GradVarName("BatchSum")); + ctx.Output(framework::GradVarName("BatchSize")); + auto *d_batch_sum = + ctx.Output(framework::GradVarName("BatchSum")); auto *d_batch_square_sum = - ctx.Output(framework::GradVarName("BatchSquareSum")); + ctx.Output(framework::GradVarName("BatchSquareSum")); const T *mean_data = means->data(); const T *inv_var_data = scales->data(); @@ -596,10 +597,11 @@ class DataNormGradKernel : public framework::OpKernel { d_x_arr.col(nc) = d_y_arr.col(nc) * scales_arr; } } else { - const auto *scale_w = ctx.Input("scale_w"); + const auto *scale_w = ctx.Input("scale_w"); auto *d_scale = - ctx.Output(framework::GradVarName("scale_w")); - auto *d_bias = ctx.Output(framework::GradVarName("bias")); + ctx.Output(framework::GradVarName("scale_w")); + auto *d_bias = + ctx.Output(framework::GradVarName("bias")); ConstEigenVectorArrayMap scale_arr(scale_w->data(), C); T 
*d_bias_data = nullptr; T *d_scale_data = nullptr; diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index e3f510e755b9c..b0819d81a1dab 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; using platform::PADDLE_CUDA_NUM_THREADS; @@ -107,7 +107,7 @@ template class DataNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); + const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); // Align with CPU version, but should we add this restriction? PADDLE_ENFORCE_EQ( @@ -116,18 +116,20 @@ class DataNormKernel : public framework::OpKernel { platform::errors::PreconditionNotMet("The Input dim size should be 2")); const int N = x_dims[0]; const int C = x_dims[1]; - const T *batch_size_in = ctx.Input("BatchSize")->data(); - const T *batch_sum_in = ctx.Input("BatchSum")->data(); + const T *batch_size_in = + ctx.Input("BatchSize")->data(); + const T *batch_sum_in = ctx.Input("BatchSum")->data(); const T *batch_square_sum_in = - ctx.Input("BatchSquareSum")->data(); + ctx.Input("BatchSquareSum")->data(); auto *x_data = x->data(); // alloc memory - T *y_data = ctx.Output("Y")->mutable_data(ctx.GetPlace()); + T *y_data = + ctx.Output("Y")->mutable_data(ctx.GetPlace()); T *mean_out_data = - ctx.Output("Means")->mutable_data(ctx.GetPlace()); + ctx.Output("Means")->mutable_data(ctx.GetPlace()); T *scale_out_data = - ctx.Output("Scales")->mutable_data(ctx.GetPlace()); + ctx.Output("Scales")->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context().stream(); @@ -147,10 +149,10 @@ template class DataNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scales = ctx.Input("Scales"); - const auto *means = ctx.Input("Means"); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scales = ctx.Input("Scales"); + const auto *means = ctx.Input("Means"); const float epsilon = ctx.Attr("epsilon"); const float dr = ctx.Attr("summary_decay_rate"); const bool need_sync_stats = ctx.Attr("sync_stats"); @@ -167,14 +169,16 @@ class DataNormGradKernel : public framework::OpKernel { // init output Tensor *d_x = nullptr; if (ctx.HasOutput(framework::GradVarName("X"))) { - d_x = ctx.Output(framework::GradVarName("X")); + d_x = ctx.Output(framework::GradVarName("X")); } - T *d_batch_size = ctx.Output(framework::GradVarName("BatchSize")) - ->mutable_data(ctx.GetPlace()); - T *d_batch_sum = ctx.Output(framework::GradVarName("BatchSum")) - ->mutable_data(ctx.GetPlace()); + T *d_batch_size = + ctx.Output(framework::GradVarName("BatchSize")) + ->mutable_data(ctx.GetPlace()); + T *d_batch_sum = + ctx.Output(framework::GradVarName("BatchSum")) + ->mutable_data(ctx.GetPlace()); T *d_batch_square_sum = - ctx.Output(framework::GradVarName("BatchSquareSum")) + ctx.Output(framework::GradVarName("BatchSquareSum")) ->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context().stream(); @@ -234,12 +238,12 @@ class DataNormGradKernel : public framework::OpKernel 
{ #endif } - T *batch_size_data = - ctx.Output("BatchSize")->mutable_data(ctx.GetPlace()); - T *batch_sum_data = - ctx.Output("BatchSum")->mutable_data(ctx.GetPlace()); - T *batch_square_sum_data = - ctx.Output("BatchSquareSum")->mutable_data(ctx.GetPlace()); + T *batch_size_data = ctx.Output("BatchSize") + ->mutable_data(ctx.GetPlace()); + T *batch_sum_data = ctx.Output("BatchSum") + ->mutable_data(ctx.GetPlace()); + T *batch_square_sum_data = ctx.Output("BatchSquareSum") + ->mutable_data(ctx.GetPlace()); KernelUpdateParam<<>>( C, d_batch_size, diff --git a/paddle/fluid/operators/decode_jpeg_op.cc b/paddle/fluid/operators/decode_jpeg_op.cc index 6e12b25028b04..973552adb5e30 100644 --- a/paddle/fluid/operators/decode_jpeg_op.cc +++ b/paddle/fluid/operators/decode_jpeg_op.cc @@ -40,7 +40,7 @@ class DecodeJpegOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { if (var_name == "X") { return expected_kernel_type; diff --git a/paddle/fluid/operators/deformable_conv_op_mlu.cc b/paddle/fluid/operators/deformable_conv_op_mlu.cc index 0e3e45148fe91..08969ba98fcd2 100644 --- a/paddle/fluid/operators/deformable_conv_op_mlu.cc +++ b/paddle/fluid/operators/deformable_conv_op_mlu.cc @@ -18,17 +18,17 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class DeformableConvMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* offset = ctx.Input("Offset"); - auto* mask = ctx.Input("Mask"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); + auto* input = ctx.Input("Input"); + auto* offset = ctx.Input("Offset"); + auto* mask = ctx.Input("Mask"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); const int groups = ctx.Attr("groups"); @@ -125,17 +125,21 @@ template class DeformableConvGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - auto* input_grad = ctx.Output(framework::GradVarName("Input")); - auto* filter_grad = ctx.Output(framework::GradVarName("Filter")); - auto* offset_grad = ctx.Output(framework::GradVarName("Offset")); - auto* mask_grad = ctx.Output(framework::GradVarName("Mask")); - - const Tensor* input = ctx.Input("Input"); - auto* offset = ctx.Input("Offset"); - auto* mask = ctx.Input("Mask"); - auto* filter = ctx.Input("Filter"); + const phi::DenseTensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + auto* input_grad = + ctx.Output(framework::GradVarName("Input")); + auto* filter_grad = + ctx.Output(framework::GradVarName("Filter")); + auto* offset_grad = + ctx.Output(framework::GradVarName("Offset")); + auto* mask_grad = + ctx.Output(framework::GradVarName("Mask")); + + const phi::DenseTensor* input = ctx.Input("Input"); + auto* offset = ctx.Input("Offset"); + auto* mask = ctx.Input("Mask"); + auto* filter = ctx.Input("Filter"); int groups = ctx.Attr("groups"); int deformable_groups = ctx.Attr("deformable_groups"); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu 
index 2fcdebd5e826a..acfbda237c2c8 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -39,7 +39,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using paddle::platform::PADDLE_CUDA_NUM_THREADS; @@ -184,12 +184,12 @@ template class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); + const phi::DenseTensor* input = ctx.Input("Input"); const LoDTensor* rois = ctx.Input("ROIs"); - const Tensor* trans = ctx.Input("Trans"); - Tensor* out = ctx.Output("Output"); + const phi::DenseTensor* trans = ctx.Input("Trans"); + phi::DenseTensor* out = ctx.Output("Output"); out->mutable_data(ctx.GetPlace()); - Tensor* top_count = ctx.Output("TopCount"); + phi::DenseTensor* top_count = ctx.Output("TopCount"); top_count->mutable_data(ctx.GetPlace()); auto no_trans = ctx.Attr("no_trans"); @@ -236,7 +236,7 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { const T* bottom_rois = rois->data(); const T* bottom_trans = no_trans ? NULL : trans->data(); - framework::Tensor roi_batch_id_list; + phi::DenseTensor roi_batch_id_list; roi_batch_id_list.Resize({num_rois}); auto cplace = platform::CPUPlace(); int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); @@ -489,14 +489,16 @@ template class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); + const phi::DenseTensor* input = ctx.Input("Input"); const LoDTensor* rois = ctx.Input("ROIs"); - const Tensor* trans = ctx.Input("Trans"); - const Tensor* top_count = ctx.Input("TopCount"); - const Tensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); - Tensor* trans_grad = ctx.Output(framework::GradVarName("Trans")); + const phi::DenseTensor* trans = ctx.Input("Trans"); + const phi::DenseTensor* top_count = ctx.Input("TopCount"); + const phi::DenseTensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + phi::DenseTensor* input_grad = + ctx.Output(framework::GradVarName("Input")); + phi::DenseTensor* trans_grad = + ctx.Output(framework::GradVarName("Trans")); phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.cuda_device_context(); @@ -550,7 +552,7 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { } const T* top_count_data = top_count->data(); - framework::Tensor roi_batch_id_list; + phi::DenseTensor roi_batch_id_list; roi_batch_id_list.Resize({num_rois}); auto cplace = platform::CPUPlace(); int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index 937afa362996c..d6961524e6f0c 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.h +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -33,7 +33,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -173,12 +173,12 @@ template class DeformablePSROIPoolCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - 
auto* input = ctx.Input("Input"); + auto* input = ctx.Input("Input"); auto* rois = ctx.Input("ROIs"); - auto* trans = ctx.Input("Trans"); - auto* out = ctx.Output("Output"); + auto* trans = ctx.Input("Trans"); + auto* out = ctx.Output("Output"); out->mutable_data(ctx.GetPlace()); - auto* top_count = ctx.Output("TopCount"); + auto* top_count = ctx.Output("TopCount"); top_count->mutable_data(ctx.GetPlace()); phi::funcs::SetConstant set_zero; @@ -196,7 +196,7 @@ class DeformablePSROIPoolCPUKernel : public framework::OpKernel { "is:%d.", num_rois, out->dims()[0])); - framework::Tensor roi_batch_id_list; + phi::DenseTensor roi_batch_id_list; roi_batch_id_list.Resize({num_rois}); int* roi_batch_id_data = roi_batch_id_list.mutable_data(ctx.GetPlace()); @@ -475,19 +475,22 @@ template class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); + auto* input = ctx.Input("Input"); auto* rois = ctx.Input("ROIs"); - auto* trans = ctx.Input("Trans"); - auto* top_count = ctx.Input("TopCount"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - auto* input_grad = ctx.Output(framework::GradVarName("Input")); + auto* trans = ctx.Input("Trans"); + auto* top_count = ctx.Input("TopCount"); + auto* output_grad = + ctx.Input(framework::GradVarName("Output")); + auto* input_grad = + ctx.Output(framework::GradVarName("Input")); phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); set_zero(dev_ctx, input_grad, static_cast(.0)); } - auto* trans_grad = ctx.Output(framework::GradVarName("Trans")); + auto* trans_grad = + ctx.Output(framework::GradVarName("Trans")); if (trans_grad) { trans_grad->mutable_data(ctx.GetPlace()); set_zero(dev_ctx, trans_grad, static_cast(.0)); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc index ff4bb5f53341b..99c4fad0fa2ab 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cc +++ b/paddle/fluid/operators/dequantize_abs_max_op.cc @@ -35,10 +35,10 @@ namespace operators { template struct DequantizeFunctor { void operator()(const phi::CPUContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor* scale, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, float max_range, - framework::Tensor* out) { + phi::DenseTensor* out) { const float* scale_factor = scale->data(); const T* input_data = in->data(); float* output_data = out->mutable_data(dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cu b/paddle/fluid/operators/dequantize_abs_max_op.cu index 57d2c02adb095..70c0aca78baec 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cu +++ b/paddle/fluid/operators/dequantize_abs_max_op.cu @@ -29,10 +29,10 @@ __global__ void KeDequantize( template struct DequantizeFunctor { void operator()(const phi::GPUContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor* scale, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, float max_range, - framework::Tensor* out) { + phi::DenseTensor* out) { const T* in_data = in->data(); const float* scale_factor = scale->data(); float* out_data = out->mutable_data(dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.h b/paddle/fluid/operators/dequantize_abs_max_op.h index fb1fa313da42a..4d9a893c66c3c 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.h 
+++ b/paddle/fluid/operators/dequantize_abs_max_op.h @@ -30,20 +30,20 @@ namespace operators { template struct DequantizeFunctor { void operator()(const DeviceContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor* scale, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, float max_range, - framework::Tensor* out); + phi::DenseTensor* out); }; template class DequantizeMaxAbsKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("X"); - auto* scale = ctx.Input("Scale"); + auto* in = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); float max_range = ctx.Attr("max_range"); diff --git a/paddle/fluid/operators/dequantize_log_op.cc b/paddle/fluid/operators/dequantize_log_op.cc index b3c1770493c9c..62359a2ce2124 100644 --- a/paddle/fluid/operators/dequantize_log_op.cc +++ b/paddle/fluid/operators/dequantize_log_op.cc @@ -34,9 +34,9 @@ namespace operators { template struct DequantizeFunctor { void operator()(const phi::CPUContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor* dict, - framework::Tensor* out) { + const phi::DenseTensor* in, + const phi::DenseTensor* dict, + phi::DenseTensor* out) { const float* dict_data = dict->data(); const T* input_data = in->data(); float* output_data = out->mutable_data(dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 2c47d9b17aa06..360871f9e7251 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -38,9 +38,9 @@ __global__ void KeDequantize(const T* in, template struct DequantizeFunctor { void operator()(const phi::GPUContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor* dict, - framework::Tensor* out) { + const phi::DenseTensor* in, + const phi::DenseTensor* dict, + phi::DenseTensor* out) { const T* in_data = in->data(); const float* dict_data = dict->data(); float* out_data = out->mutable_data(dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/dequantize_log_op.h b/paddle/fluid/operators/dequantize_log_op.h index 01613be898e7b..d15f0392e82fc 100644 --- a/paddle/fluid/operators/dequantize_log_op.h +++ b/paddle/fluid/operators/dequantize_log_op.h @@ -29,18 +29,18 @@ namespace operators { template struct DequantizeFunctor { void operator()(const DeviceContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor* dict, - framework::Tensor* out); + const phi::DenseTensor* in, + const phi::DenseTensor* dict, + phi::DenseTensor* out); }; template class DequantizeLogKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("X"); - auto* dict = ctx.Input("Dict"); - auto* out = ctx.Output("Out"); + auto* in = ctx.Input("X"); + auto* dict = ctx.Input("Dict"); + auto* out = ctx.Output("Out"); auto& dev_ctx = ctx.template device_context(); out->mutable_data(dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/dequantize_op.h b/paddle/fluid/operators/dequantize_op.h index ea7a08c8f3684..f319828a6be4b 100644 --- a/paddle/fluid/operators/dequantize_op.h +++ b/paddle/fluid/operators/dequantize_op.h @@ -23,7 +23,6 @@ namespace paddle { namespace operators { using framework::OpKernelType; -using framework::Tensor; class DeQuantOp : public framework::OperatorWithKernel { public: diff --git 
a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu index 30250eb8cc048..eeb4d731b7b3b 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cu +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -75,9 +75,9 @@ template class AnchorGeneratorOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* anchors = ctx.Output("Anchors"); - auto* vars = ctx.Output("Variances"); + auto* input = ctx.Input("Input"); + auto* anchors = ctx.Output("Anchors"); + auto* vars = ctx.Output("Variances"); auto anchor_sizes = ctx.Attr>("anchor_sizes"); auto aspect_ratios = ctx.Attr>("aspect_ratios"); @@ -101,13 +101,13 @@ class AnchorGeneratorOpCUDAKernel : public framework::OpKernel { anchors->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); - framework::Tensor ar; + phi::DenseTensor ar; framework::TensorFromVector(aspect_ratios, ctx.device_context(), &ar); - framework::Tensor as; + phi::DenseTensor as; framework::TensorFromVector(anchor_sizes, ctx.device_context(), &as); - framework::Tensor sd; + phi::DenseTensor sd; framework::TensorFromVector(stride, ctx.device_context(), &sd); GenAnchors<<>>(anchors->data(), @@ -121,7 +121,7 @@ class AnchorGeneratorOpCUDAKernel : public framework::OpKernel { width, offset); - framework::Tensor v; + phi::DenseTensor v; framework::TensorFromVector(variances, ctx.device_context(), &v); grid = (box_num * 4 + block - 1) / block; SetVariance<<>>( diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h index 767229bfee001..aaebcef3c901f 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.h +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -47,9 +47,9 @@ template class AnchorGeneratorOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* anchors = ctx.Output("Anchors"); - auto* vars = ctx.Output("Variances"); + auto* input = ctx.Input("Input"); + auto* anchors = ctx.Output("Anchors"); + auto* vars = ctx.Output("Variances"); auto anchor_sizes = ctx.Attr>("anchor_sizes"); auto aspect_ratios = ctx.Attr>("aspect_ratios"); @@ -106,7 +106,7 @@ class AnchorGeneratorOpKernel : public framework::OpKernel { } } - framework::Tensor var_t; + phi::DenseTensor var_t; var_t.mutable_data( phi::make_ddim({1, static_cast(variances.size())}), ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index e4accef0fa9b3..a831cbf7062b8 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -30,7 +30,7 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index bfe3adc2c1f20..4046f6b2830d8 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -56,11 +56,11 @@ inline HOSTDEVICE T RoIArea(const T* box, bool pixel_offset = true) { */ template inline void BoxToDelta(const int box_num, - const framework::Tensor& ex_boxes, - const 
framework::Tensor& gt_boxes, + const phi::DenseTensor& ex_boxes, + const phi::DenseTensor& gt_boxes, const float* weights, const bool normalized, - framework::Tensor* box_delta) { + phi::DenseTensor* box_delta) { auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); auto trg = framework::EigenTensor::From(*box_delta); @@ -101,9 +101,9 @@ void Gather( } template -void BboxOverlaps(const framework::Tensor& r_boxes, - const framework::Tensor& c_boxes, - framework::Tensor* overlaps) { +void BboxOverlaps(const phi::DenseTensor& r_boxes, + const phi::DenseTensor& c_boxes, + phi::DenseTensor* overlaps) { auto r_boxes_et = framework::EigenTensor::From(r_boxes); auto c_boxes_et = framework::EigenTensor::From(c_boxes); auto overlaps_et = framework::EigenTensor::From(*overlaps); @@ -136,7 +136,7 @@ void BboxOverlaps(const framework::Tensor& r_boxes, // Calculate max IoU between each box and ground-truth and // each row represents one box template -void MaxIoU(const framework::Tensor& iou, framework::Tensor* max_iou) { +void MaxIoU(const phi::DenseTensor& iou, phi::DenseTensor* max_iou) { const T* iou_data = iou.data(); int row = iou.dims()[0]; int col = iou.dims()[1]; @@ -148,9 +148,9 @@ void MaxIoU(const framework::Tensor& iou, framework::Tensor* max_iou) { } } -static void AppendProposals(framework::Tensor* dst, +static void AppendProposals(phi::DenseTensor* dst, int64_t offset, - const framework::Tensor& src) { + const phi::DenseTensor& src) { auto* out_data = dst->data(); auto* to_add_data = src.data(); size_t size_of_t = framework::DataTypeSize(src.dtype()); @@ -163,9 +163,9 @@ static void AppendProposals(framework::Tensor* dst, template void ClipTiledBoxes(const platform::DeviceContext& ctx, - const framework::Tensor& im_info, - const framework::Tensor& input_boxes, - framework::Tensor* out, + const phi::DenseTensor& im_info, + const phi::DenseTensor& input_boxes, + phi::DenseTensor* out, bool is_scale = true, bool pixel_offset = true) { T* out_data = out->mutable_data(ctx.GetPlace()); @@ -197,11 +197,11 @@ void ClipTiledBoxes(const platform::DeviceContext& ctx, // Filter the box with small area template void FilterBoxes(const platform::DeviceContext& ctx, - const framework::Tensor* boxes, + const phi::DenseTensor* boxes, float min_size, - const framework::Tensor& im_info, + const phi::DenseTensor& im_info, bool is_scale, - framework::Tensor* keep, + phi::DenseTensor* keep, bool pixel_offset = true) { const T* im_info_data = im_info.data(); const T* boxes_data = boxes->data(); @@ -238,10 +238,10 @@ void FilterBoxes(const platform::DeviceContext& ctx, template static void BoxCoder(const platform::DeviceContext& ctx, - framework::Tensor* all_anchors, - framework::Tensor* bbox_deltas, - framework::Tensor* variances, - framework::Tensor* proposals, + phi::DenseTensor* all_anchors, + phi::DenseTensor* bbox_deltas, + phi::DenseTensor* variances, + phi::DenseTensor* proposals, const bool pixel_offset = true) { T* proposals_data = proposals->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index ef824d2d8cdcd..45c21c0f570fb 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -18,7 +18,7 @@ limitations under the License. 
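[Editor's note] bbox_util.h above now passes phi::DenseTensor into its Eigen- and pointer-based helpers. As one concrete example, here is a sketch of MaxIoU consistent with the data-pointer prologue visible in the hunk; the diff only shows the signature and the row/col setup, so the reduction loop itself is an assumed completion.

// Row-wise max over an IoU matrix: each row corresponds to one box, and the
// output holds that box's best overlap with any ground-truth box.
#include <algorithm>

#include "paddle/phi/core/dense_tensor.h"

template <typename T>
void MaxIoU(const phi::DenseTensor& iou, phi::DenseTensor* max_iou) {
  const T* iou_data = iou.data<T>();
  int row = iou.dims()[0];
  int col = iou.dims()[1];
  T* max_iou_data = max_iou->data<T>();
  for (int i = 0; i < row; ++i) {
    const T* v = iou_data + i * col;
    max_iou_data[i] = *std::max_element(v, v + col);
  }
}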
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class BipartiteMatchOp : public framework::OperatorWithKernel { @@ -72,7 +72,7 @@ class BipartiteMatchKernel : public framework::OpKernel { public: // The match_indices must be initialized to -1 at first. // The match_dist must be initialized to 0 at first. - void BipartiteMatch(const Tensor& dist, + void BipartiteMatch(const phi::DenseTensor& dist, int* match_indices, T* match_dist) const { PADDLE_ENFORCE_EQ( @@ -157,7 +157,7 @@ class BipartiteMatchKernel : public framework::OpKernel { } } - void ArgMaxMatch(const Tensor& dist, + void ArgMaxMatch(const phi::DenseTensor& dist, int* match_indices, T* match_dist, T overlap_threshold) const { @@ -197,8 +197,9 @@ class BipartiteMatchKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* dist_mat = context.Input("DistMat"); - auto* match_indices = context.Output("ColToRowMatchIndices"); - auto* match_dist = context.Output("ColToRowMatchDist"); + auto* match_indices = + context.Output("ColToRowMatchIndices"); + auto* match_dist = context.Output("ColToRowMatchDist"); auto& dev_ctx = context.device_context(); diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 87dc4a30abb31..d50759c71fe3d 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTenso = framework::LoDTensor; static constexpr int ImInfoSize = 3; @@ -50,7 +50,7 @@ class GPUBoxClipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *input = context.Input("Input"); - auto *im_info = context.Input("ImInfo"); + auto *im_info = context.Input("ImInfo"); auto *output = context.Output("Output"); const int64_t num = input->dims()[0]; const int64_t bbox_width = input->numel() / num; diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h index 5c816ee3eb5e2..e85ef88ccdc91 100644 --- a/paddle/fluid/operators/detection/box_clip_op.h +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template diff --git a/paddle/fluid/operators/detection/box_coder_op_npu.cc b/paddle/fluid/operators/detection/box_coder_op_npu.cc index 8181f10f2bc5b..4a98920f64b19 100644 --- a/paddle/fluid/operators/detection/box_coder_op_npu.cc +++ b/paddle/fluid/operators/detection/box_coder_op_npu.cc @@ -18,7 +18,7 @@ limitations under the License. 
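[Editor's note] Several of the files above (bipartite_match, box_clip, box_coder NPU, and most of the detection ops that follow) keep a file-local `using Tensor = phi::DenseTensor;` while individual signatures are migrated, so existing unqualified `Tensor` spellings keep compiling. Below is a standalone, simplified illustration of why the alias makes the change source-compatible; the mock namespaces and class are stand-ins, not Paddle's real definitions.

// Mock types: a minimal phi::DenseTensor stand-in plus the transitional
// alias inside the operators namespace, as the patch keeps it.
#include <iostream>

namespace phi {
class DenseTensor {
 public:
  int numel() const { return 0; }
};
}  // namespace phi

namespace paddle {
namespace operators {

using Tensor = phi::DenseTensor;  // transitional alias kept by the patch

// New-style signature, spelled with phi::DenseTensor directly.
void Consume(const phi::DenseTensor& t) { std::cout << t.numel() << "\n"; }

// Old-style code that still refers to the unqualified alias.
Tensor MakeOne() { return Tensor(); }

}  // namespace operators
}  // namespace paddle

int main() {
  paddle::operators::Tensor t = paddle::operators::MakeOne();
  paddle::operators::Consume(t);  // both spellings name the same type
  return 0;
}

Where a file no longer needs the alias at all, the patch simply deletes the old `using framework::Tensor;` declaration, as the dequantize_op.h hunk earlier and the nms_op.cc hunk later in this patch do.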
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template struct BoxCoderFunction { @@ -28,29 +28,29 @@ struct BoxCoderFunction { stream = ctx.template device_context() .stream(); } - Tensor Adds(const Tensor& x, float scalar) { + Tensor Adds(const phi::DenseTensor& x, float scalar) { Tensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}}); runner.Run(stream); return y; } - Tensor Muls(const Tensor& x, float scalar) { + Tensor Muls(const phi::DenseTensor& x, float scalar) { Tensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}}); runner.Run(stream); return y; } - Tensor Mul(const Tensor& x, const Tensor& y) { + Tensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) { Tensor z; z.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {}); runner.Run(stream); return z; } - Tensor SubWithBroadCast(const Tensor& x, - const Tensor& y, + Tensor SubWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, const framework::DDim& shape) { Tensor z; z.mutable_data(shape, place); @@ -58,59 +58,59 @@ struct BoxCoderFunction { runner.Run(stream); return z; } - void DivWithBroadCastVoid(const Tensor& x, - const Tensor& y, + void DivWithBroadCastVoid(const phi::DenseTensor& x, + const phi::DenseTensor& y, const framework::DDim& shape, - Tensor* z) { + phi::DenseTensor* z) { z->mutable_data(shape, place); const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {}); runner.Run(stream); } - Tensor DivWithBroadCast(const Tensor& x, - const Tensor& y, + Tensor DivWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, const framework::DDim& shape) { Tensor z; DivWithBroadCastVoid(x, y, shape, &z); return z; } - void MulWithBroadCastVoid(const Tensor& x, - const Tensor& y, + void MulWithBroadCastVoid(const phi::DenseTensor& x, + const phi::DenseTensor& y, const framework::DDim& shape, - Tensor* z) { + phi::DenseTensor* z) { z->mutable_data(shape, place); const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {}); runner.Run(stream); } - Tensor MulWithBroadCast(const Tensor& x, - const Tensor& y, + Tensor MulWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, const framework::DDim& shape) { Tensor z; MulWithBroadCastVoid(x, y, shape, &z); return z; } - void AddWithBroadCastVoid(const Tensor& x, - const Tensor& y, + void AddWithBroadCastVoid(const phi::DenseTensor& x, + const phi::DenseTensor& y, const framework::DDim& shape, - Tensor* z) { + phi::DenseTensor* z) { z->mutable_data(shape, place); const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {}); runner.Run(stream); } - Tensor AddWithBroadCast(const Tensor& x, - const Tensor& y, + Tensor AddWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, const framework::DDim& shape) { Tensor z; AddWithBroadCastVoid(x, y, shape, &z); return z; } - Tensor Abs(const Tensor& x) { + Tensor Abs(const phi::DenseTensor& x) { Tensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Abs", {x}, {y}, {}); runner.Run(stream); return y; } - Tensor Log(const Tensor& x) { + Tensor Log(const phi::DenseTensor& x) { Tensor t_x_m1 = Adds(x, -1); Tensor y; y.mutable_data(x.dims(), place); @@ -118,14 +118,14 @@ struct BoxCoderFunction { runner.Run(stream); return y; } - Tensor Exp(const Tensor& x) { + Tensor Exp(const phi::DenseTensor& x) { Tensor y; y.mutable_data(x.dims(), place); const 
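[Editor's note] The BoxCoderFunction hunks above repeat one pattern per NPU op: allocate the result with mutable_data on the kernel's place, build an NpuOpRunner, and run it on the context's stream. A condensed sketch of that pattern follows, with the constructor and the Muls body taken from the diff; the struct name is illustrative, and the include path and NPU device-context spelling are assumptions that may differ by Paddle version.

// Sketch of the wrapper style used by BoxCoderFunction above; assumes a
// Paddle NPU build where NpuOpRunner and NPUDeviceContext are available.
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"  // path assumed
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

template <typename T>
struct NpuEltwiseHelper {  // illustrative name, modeled on BoxCoderFunction
  explicit NpuEltwiseHelper(const framework::ExecutionContext& ctx) {
    place = ctx.GetPlace();
    stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();
  }

  // Scalar multiply: allocate the output, then dispatch the NPU "Muls" op.
  phi::DenseTensor Muls(const phi::DenseTensor& x, float scalar) {
    phi::DenseTensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}});
    runner.Run(stream);
    return y;
  }

  platform::Place place;
  aclrtStream stream;
};

}  // namespace operators
}  // namespace paddle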
auto& runner = NpuOpRunner("Exp", {x}, {y}, {}); runner.Run(stream); return y; } - Tensor Dot(const Tensor& x, const Tensor& y) { + Tensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) { auto dim_x = x.dims(); auto dim_y = y.dims(); PADDLE_ENFORCE_EQ( @@ -158,7 +158,7 @@ struct BoxCoderFunction { void ConcatVoid(const std::vector& inputs, const framework::DDim& shape_out, int axis, - Tensor* output) { + phi::DenseTensor* output) { output->mutable_data(shape_out, place); std::vector names; for (size_t i = 0; i < inputs.size(); i++) { @@ -179,7 +179,7 @@ struct BoxCoderFunction { ConcatVoid(inputs, shape_out, axis, &output); return output; } - Tensor Slice(const Tensor& x, + Tensor Slice(const phi::DenseTensor& x, const std::vector& offsets, const std::vector& size, const framework::DDim& shape) { @@ -201,7 +201,7 @@ template void Vector2Tensor(const framework::ExecutionContext& ctx, const std::vector& vec, const framework::DDim& ddim, - Tensor* tsr) { + phi::DenseTensor* tsr) { framework::TensorFromVector(vec, ctx.device_context(), tsr); ctx.template device_context().Wait(); tsr->Resize(ddim); @@ -209,12 +209,12 @@ void Vector2Tensor(const framework::ExecutionContext& ctx, template void BoxCoderEnc(const framework::ExecutionContext& ctx, - const Tensor* tb, - const Tensor* pb, - const Tensor* pbv, + const phi::DenseTensor* tb, + const phi::DenseTensor* pb, + const phi::DenseTensor* pbv, const bool norm, const std::vector& variance, - Tensor* out) { + phi::DenseTensor* out) { auto M = pb->dims()[0]; auto N = tb->dims()[0]; auto shape_0 = phi::make_ddim({4, 2}); @@ -273,13 +273,13 @@ void BoxCoderEnc(const framework::ExecutionContext& ctx, template void BoxCoderDec(const framework::ExecutionContext& ctx, - const Tensor* tb, - const Tensor* pb, - const Tensor* pbv, + const phi::DenseTensor* tb, + const phi::DenseTensor* pb, + const phi::DenseTensor* pbv, const bool norm, const std::vector& variance, int axis, - Tensor* out) { + phi::DenseTensor* out) { auto shape_0 = phi::make_ddim({4, 2}); Tensor m_diff; Tensor m_aver; @@ -378,10 +378,10 @@ template class BoxCoderNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* prior_box = ctx.Input("PriorBox"); - auto* prior_box_var = ctx.Input("PriorBoxVar"); + auto* prior_box = ctx.Input("PriorBox"); + auto* prior_box_var = ctx.Input("PriorBoxVar"); auto* target_box = ctx.Input("TargetBox"); - auto* output_box = ctx.Output("OutputBox"); + auto* output_box = ctx.Output("OutputBox"); std::vector variance = ctx.Attr>("variance"); const int axis = ctx.Attr("axis"); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu index f87a636bdfb02..e37c0299110a3 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -100,12 +100,12 @@ class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* prior_box = context.Input("PriorBox"); - auto* prior_box_var = context.Input("PriorBoxVar"); + auto* prior_box_var = context.Input("PriorBoxVar"); auto* target_box = context.Input("TargetBox"); auto* box_score = context.Input("BoxScore"); - auto* output_box = context.Output("DecodeBox"); + auto* output_box = context.Output("DecodeBox"); auto* output_assign_box = - context.Output("OutputAssignBox"); + 
context.Output("OutputAssignBox"); auto roi_num = target_box->dims()[0]; auto class_num = box_score->dims()[1]; diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h index 85ee3b76448ad..1377fecd3d4d8 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h @@ -25,12 +25,12 @@ class BoxDecoderAndAssignKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* prior_box = context.Input("PriorBox"); - auto* prior_box_var = context.Input("PriorBoxVar"); + auto* prior_box_var = context.Input("PriorBoxVar"); auto* target_box = context.Input("TargetBox"); auto* box_score = context.Input("BoxScore"); - auto* output_box = context.Output("DecodeBox"); + auto* output_box = context.Output("DecodeBox"); auto* output_assign_box = - context.Output("OutputAssignBox"); + context.Output("OutputAssignBox"); int roi_num = target_box->dims()[0]; int class_num = box_score->dims()[1]; auto* target_box_data = target_box->data(); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc index 48902f517967b..95b9d006bffe6 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -16,7 +16,7 @@ limitations under the License.*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class CollectFpnProposalsOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 0fbc54d3135d6..936dc7a50b45b 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -33,7 +33,7 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; static constexpr int kNumCUDAThreads = 64; @@ -89,12 +89,12 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { int lod_size; auto place = dev_ctx.GetPlace(); - auto multi_rois_num = ctx.MultiInput("MultiLevelRoIsNum"); + auto multi_rois_num = ctx.MultiInput("MultiLevelRoIsNum"); for (size_t i = 0; i < roi_ins.size(); ++i) { auto roi_in = roi_ins[i]; auto score_in = score_ins[i]; if (multi_rois_num.size() > 0) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync( *multi_rois_num[i], platform::CPUPlace(), &temp); const int* length_in = temp.data(); @@ -250,7 +250,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { } if (ctx.HasOutput("RoisNum")) { - auto* rois_num = ctx.Output("RoisNum"); + auto* rois_num = ctx.Output("RoisNum"); int* rois_num_data = rois_num->mutable_data({lod_size}, place); memory::Copy(place, rois_num_data, diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index c9b9acfcb2005..6ac6fc3e09a69 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -67,7 +67,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { 
auto multi_layer_scores = context.MultiInput("MultiLevelScores"); auto multi_rois_num = - context.MultiInput("MultiLevelRoIsNum"); + context.MultiInput("MultiLevelRoIsNum"); int num_size = multi_rois_num.size(); auto* fpn_rois = context.Output("FpnRois"); @@ -182,7 +182,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { } num_per_batch.emplace_back(post_nms_topN - pre_idx); if (context.HasOutput("RoisNum")) { - auto* rois_num = context.Output("RoisNum"); + auto* rois_num = context.Output("RoisNum"); int* rois_num_data = rois_num->mutable_data({batch_size}, context.GetPlace()); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index aa60d054546cd..9dbdc35d07f02 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -87,10 +87,10 @@ template class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* vars = ctx.Output("Variances"); + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); auto variances = ctx.Attr>("variances"); auto is_clip = ctx.Attr("clip"); @@ -124,7 +124,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { } int step_average = static_cast((step_width + step_height) * 0.5); - framework::Tensor h_temp; + phi::DenseTensor h_temp; T* tdata = h_temp.mutable_data({num_priors * 4}, platform::CPUPlace()); int idx = 0; for (size_t s = 0; s < fixed_sizes.size(); ++s) { @@ -152,7 +152,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { boxes->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); - framework::Tensor d_temp; + phi::DenseTensor d_temp; framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp); // At least use 32 threads, at most 512 threads. 
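[Editor's note] The density-prior-box CUDA kernel above goes the other way: it fills a small lookup table on the host (h_temp) and then moves it to the kernel's place with framework::TensorCopy. A minimal sketch of that host-build-then-copy step follows; the helper name and the placeholder fill are illustrative, only the allocation and copy calls mirror the hunk.

// Builds a small table on the CPU and copies it to the device the kernel
// runs on; TensorCopy picks the device context associated with the place.
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

template <typename T>
phi::DenseTensor BuildOnHostThenCopy(const framework::ExecutionContext& ctx,
                                     int64_t numel) {
  phi::DenseTensor h_temp;
  T* tdata = h_temp.mutable_data<T>({numel}, platform::CPUPlace());
  for (int64_t i = 0; i < numel; ++i) {
    tdata[i] = static_cast<T>(0);  // placeholder; the real kernel writes
                                   // per-prior widths and heights here
  }
  phi::DenseTensor d_temp;
  framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp);
  return d_temp;
}

}  // namespace operators
}  // namespace paddle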
diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h index 0912ce9016031..f3e3cb0ffb6ca 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.h +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -22,10 +22,10 @@ template class DensityPriorBoxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* vars = ctx.Output("Variances"); + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); auto variances = ctx.Attr>("variances"); auto clip = ctx.Attr("clip"); @@ -121,7 +121,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { return std::min(std::max(v, 0.), 1.); }); } - framework::Tensor var_t; + phi::DenseTensor var_t; var_t.mutable_data( phi::make_ddim({1, static_cast(variances.size())}), ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc index 2b28cd926f513..a6f9170712d96 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc @@ -15,7 +15,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using fp16 = paddle::platform::float16; template @@ -31,55 +31,67 @@ struct DensityPriorBoxFunction { FillNpuTensorWithConstant(&t0, static_cast(0)); FillNpuTensorWithConstant(&t1, static_cast(1)); } - void Arange(int n, Tensor* x) { + void Arange(int n, phi::DenseTensor* x) { // x should be init first FillNpuTensorWithConstant(&tn, static_cast(n)); const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); runner.Run(stream); } - void Add(const Tensor* x, const Tensor* y, Tensor* z) { + void Add(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // z should be init first const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Cast(const Tensor* x, Tensor* y) { + void Cast(const phi::DenseTensor* x, phi::DenseTensor* y) { auto dst_dtype = ConvertToNpuDtype(framework::TransToProtoVarType(y->type())); const auto& runner = NpuOpRunner( "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); runner.Run(stream); } - void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + void Sub(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // z should be init first const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + void Mul(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Adds(const Tensor* x, float scalar, Tensor* y) { + void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { // y should be init first const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); runner.Run(stream); } - void Muls(const Tensor* x, float scalar, Tensor* y) { + void Muls(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { // y should be init first const auto& runner = NpuOpRunner("Muls", {*x}, 
{*y}, {{"value", scalar}}); runner.Run(stream); } - void Maximum(const Tensor* x, const Tensor* y, Tensor* z) { + void Maximum(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Minimum(const Tensor* x, const Tensor* y, Tensor* z) { + void Minimum(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Concat(const std::vector& inputs, int axis, Tensor* output) { + void Concat(const std::vector& inputs, + int axis, + phi::DenseTensor* output) { // output should be init first std::vector names; for (size_t i = 0; i < inputs.size(); i++) { @@ -93,7 +105,9 @@ struct DensityPriorBoxFunction { runner.AddInputNames(names); runner.Run(stream); } - void Tile(const Tensor* x, Tensor* y, const std::vector& multiples) { + void Tile(const phi::DenseTensor* x, + phi::DenseTensor* y, + const std::vector& multiples) { // y should be init first if (x->dims() == y->dims()) { framework::TensorCopy( @@ -107,7 +121,7 @@ struct DensityPriorBoxFunction { NpuOpRunner("TileD", {*x}, {*y}, {{"multiples", multiples}}); runner.Run(stream); } - void FloatVec2Tsr(const std::vector& vec, Tensor* tsr_dst) { + void FloatVec2Tsr(const std::vector& vec, phi::DenseTensor* tsr_dst) { // framework::TensorFromVector(vec, ctx.device_context(), tsr_dst); ctx.template device_context().Wait(); @@ -123,7 +137,7 @@ struct DensityPriorBoxFunction { }; template <> -void DensityPriorBoxFunction::Arange(int n, Tensor* x) { +void DensityPriorBoxFunction::Arange(int n, phi::DenseTensor* x) { Tensor x_fp32(experimental::DataType::FLOAT32); x_fp32.mutable_data(x->dims(), place); FillNpuTensorWithConstant(&tn, static_cast(n)); @@ -134,7 +148,7 @@ void DensityPriorBoxFunction::Arange(int n, Tensor* x) { template <> void DensityPriorBoxFunction::FloatVec2Tsr(const std::vector& vec, - Tensor* tsr_dst) { + phi::DenseTensor* tsr_dst) { Tensor tsr_fp32(experimental::DataType::FLOAT32); tsr_fp32.mutable_data(tsr_dst->dims(), place); framework::TensorFromVector(vec, ctx.device_context(), &tsr_fp32); @@ -146,10 +160,10 @@ template class DensityPriorBoxOpNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* vars = ctx.Output("Variances"); + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); auto variances = ctx.Attr>("variances"); auto clip = ctx.Attr("clip"); diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index 5473a57902b87..feb12ab90f211 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -25,12 +25,12 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; const int kBoxDim = 4; template -void AppendMask(LoDTensor* out, int64_t offset, Tensor* to_add) { +void AppendMask(LoDTensor* out, int64_t offset, phi::DenseTensor* to_add) { auto* out_data = out->data(); auto* to_add_data = to_add->data(); memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); @@ -123,11 +123,11 @@ class GenerateMaskLabelsOp : public framework::OperatorWithKernel { */ template static inline void ExpandMaskTarget(const phi::CPUContext& ctx, - const Tensor& masks, - const Tensor& mask_class_labels, + const phi::DenseTensor& masks, + const phi::DenseTensor& mask_class_labels, const int resolution, const int num_classes, - Tensor* mask_targets) { + phi::DenseTensor* mask_targets) { const uint8_t* masks_data = masks.data(); int64_t num_mask = masks.dims()[0]; const int* mask_class_labels_data = mask_class_labels.data(); @@ -151,12 +151,12 @@ static inline void ExpandMaskTarget(const phi::CPUContext& ctx, template std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, - const Tensor& im_info, - const Tensor& gt_classes, - const Tensor& is_crowd, - const Tensor& gt_segms, - const Tensor& rois, - const Tensor& label_int32, + const phi::DenseTensor& im_info, + const phi::DenseTensor& gt_classes, + const phi::DenseTensor& is_crowd, + const phi::DenseTensor& gt_segms, + const phi::DenseTensor& rois, + const phi::DenseTensor& label_int32, const int num_classes, const int resolution, const framework::LoD& segm_length) { diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 7376e0993a506..64a6120bbcad2 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -25,12 +25,12 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; const int kBoxDim = 4; template -void AppendRois(LoDTensor* out, int64_t offset, Tensor* to_add) { +void AppendRois(LoDTensor* out, int64_t offset, phi::DenseTensor* to_add) { auto* out_data = out->data(); auto* to_add_data = to_add->data(); memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); @@ -41,9 +41,9 @@ void AppendRois(LoDTensor* out, int64_t offset, Tensor* to_add) { // and the corresponding RoI will be removed. 
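[Editor's note] generate_mask_labels and generate_proposal_labels above both append per-image results into a pre-sized LoDTensor with a raw memcpy at a running offset. The sketch below restores the stripped template arguments; the <T> placements are assumptions, the body is as shown in the hunks. As used here, offset counts elements (not bytes), so callers advance it by to_add->numel() between calls.

// Append `to_add` into `out` starting at element offset `offset`.
// `out` must already be allocated large enough to hold all appended chunks.
#include <cstring>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

template <typename T>
void AppendRois(framework::LoDTensor* out, int64_t offset,
                phi::DenseTensor* to_add) {
  auto* out_data = out->data<T>();
  auto* to_add_data = to_add->data<T>();
  std::memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T));
}

}  // namespace operators
}  // namespace paddle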
template void FilterRoIs(const platform::DeviceContext& ctx, - const Tensor& rpn_rois, - const Tensor& max_overlap, - Tensor* keep) { + const phi::DenseTensor& rpn_rois, + const phi::DenseTensor& max_overlap, + phi::DenseTensor* keep) { const T* rpn_rois_dt = rpn_rois.data(); const T* max_overlap_dt = max_overlap.data(); int rois_num = max_overlap.numel(); @@ -169,9 +169,9 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { template void Concat(const phi::CPUContext& context, - const Tensor& in_tensor_a, - const Tensor& in_tensor_b, - Tensor* out_tensor) { + const phi::DenseTensor& in_tensor_a, + const phi::DenseTensor& in_tensor_b, + phi::DenseTensor* out_tensor) { int axis = 0; std::vector inputs; inputs.emplace_back(in_tensor_a); @@ -182,8 +182,8 @@ void Concat(const phi::CPUContext& context, template std::vector> SampleFgBgGt(const phi::CPUContext& context, - Tensor* iou, - const Tensor& is_crowd, + phi::DenseTensor* iou, + const phi::DenseTensor& is_crowd, const int batch_size_per_im, const float fg_fraction, const float fg_thresh, @@ -192,7 +192,7 @@ std::vector> SampleFgBgGt(const phi::CPUContext& context, std::minstd_rand engine, const bool use_random, const bool is_cascade_rcnn, - const Tensor& rpn_rois) { + const phi::DenseTensor& rpn_rois) { std::vector fg_inds; std::vector bg_inds; std::vector mapped_gt_inds; @@ -286,17 +286,17 @@ std::vector> SampleFgBgGt(const phi::CPUContext& context, template void GatherBoxesLabels(const phi::CPUContext& context, - const Tensor& boxes, - const Tensor& max_overlap, - const Tensor& gt_boxes, - const Tensor& gt_classes, + const phi::DenseTensor& boxes, + const phi::DenseTensor& max_overlap, + const phi::DenseTensor& gt_boxes, + const phi::DenseTensor& gt_classes, const std::vector& fg_inds, const std::vector& bg_inds, const std::vector& gt_inds, - Tensor* sampled_boxes, - Tensor* sampled_labels, - Tensor* sampled_gts, - Tensor* sampled_max_overlap) { + phi::DenseTensor* sampled_boxes, + phi::DenseTensor* sampled_labels, + phi::DenseTensor* sampled_gts, + phi::DenseTensor* sampled_max_overlap) { int fg_num = fg_inds.size(); int bg_num = bg_inds.size(); Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; @@ -335,11 +335,11 @@ void GatherBoxesLabels(const phi::CPUContext& context, template std::vector SampleRoisForOneImage( const phi::CPUContext& context, - const Tensor& rpn_rois_in, - const Tensor& gt_classes, - const Tensor& is_crowd, - const Tensor& gt_boxes, - const Tensor& im_info, + const phi::DenseTensor& rpn_rois_in, + const phi::DenseTensor& gt_classes, + const phi::DenseTensor& is_crowd, + const phi::DenseTensor& gt_boxes, + const phi::DenseTensor& im_info, const int batch_size_per_im, const float fg_fraction, const float fg_thresh, @@ -351,7 +351,7 @@ std::vector SampleRoisForOneImage( bool use_random, bool is_cascade_rcnn, bool is_cls_agnostic, - const Tensor& max_overlap) { + const phi::DenseTensor& max_overlap) { // 1.1 map to original image auto im_scale = im_info.data()[2]; Tensor rpn_rois; @@ -618,7 +618,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { Tensor im_info_slice = im_info->Slice(i, i + 1); Tensor max_overlap_slice; if (is_cascade_rcnn) { - auto* max_overlap = context.Input("MaxOverlap"); + auto* max_overlap = context.Input("MaxOverlap"); max_overlap_slice = max_overlap->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); } else { diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 
0118cc1f76b3f..84f542b61d120 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class GenerateProposalsOp : public framework::OperatorWithKernel { @@ -77,17 +77,18 @@ template class GenerateProposalsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *scores = context.Input("Scores"); - auto *bbox_deltas = context.Input("BboxDeltas"); - auto *im_info = context.Input("ImInfo"); - auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_info = context.Input("ImInfo"); + auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), "Input", "Anchors", "GenerateProposals"); - auto variances = GET_DATA_SAFELY(context.Input("Variances"), - "Input", - "Variances", - "GenerateProposals"); + auto variances = + GET_DATA_SAFELY(context.Input("Variances"), + "Input", + "Variances", + "GenerateProposals"); auto *rpn_rois = context.Output("RpnRois"); auto *rpn_roi_probs = context.Output("RpnRoiProbs"); @@ -165,7 +166,7 @@ class GenerateProposalsKernel : public framework::OpKernel { tmp_num.push_back(proposals.dims()[0]); } if (context.HasOutput("RpnRoisNum")) { - auto *rpn_rois_num = context.Output("RpnRoisNum"); + auto *rpn_rois_num = context.Output("RpnRoisNum"); rpn_rois_num->mutable_data({num}, context.GetPlace()); int *num_data = rpn_rois_num->data(); for (int i = 0; i < num; i++) { diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index ed1ad6da34d4a..64aea88758c3d 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -28,7 +28,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; namespace { @@ -131,17 +131,18 @@ template class CUDAGenerateProposalsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *scores = context.Input("Scores"); - auto *bbox_deltas = context.Input("BboxDeltas"); - auto *im_info = context.Input("ImInfo"); - auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_info = context.Input("ImInfo"); + auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), "Input", "Anchors", "GenerateProposals"); - auto variances = GET_DATA_SAFELY(context.Input("Variances"), - "Input", - "Variances", - "GenerateProposals"); + auto variances = + GET_DATA_SAFELY(context.Input("Variances"), + "Input", + "Variances", + "GenerateProposals"); auto *rpn_rois = context.Output("RpnRois"); auto *rpn_roi_probs = context.Output("RpnRoiProbs"); @@ -240,7 +241,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { tmp_num.push_back(proposals.dims()[0]); } if (context.HasOutput("RpnRoisNum")) { - auto *rpn_rois_num = context.Output("RpnRoisNum"); + auto *rpn_rois_num = context.Output("RpnRoisNum"); rpn_rois_num->mutable_data({num}, context.GetPlace()); int *num_data = rpn_rois_num->data(); memory::Copy(place, diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 15918030c024b..71c944a1e68aa 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -29,7 +29,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class GenerateProposalsV2Op : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/detection/iou_similarity_op.h b/paddle/fluid/operators/detection/iou_similarity_op.h index 807ccd68c5377..ae5095e664705 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.h +++ b/paddle/fluid/operators/detection/iou_similarity_op.h @@ -110,7 +110,7 @@ class IOUSimilarityKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const framework::LoDTensor* in_x = ctx.Input("X"); - const framework::Tensor* in_y = ctx.Input("Y"); + const phi::DenseTensor* in_y = ctx.Input("Y"); bool normalized = ctx.Attr("box_normalized"); framework::LoDTensor* out = ctx.Output("Out"); diff --git a/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc b/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc index 617daf670b0da..f043bbd2e162a 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template struct IouFunction { @@ -26,11 +26,15 @@ struct IouFunction { explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { place = ctx.GetPlace(); } - void Transpose(const Tensor* x, Tensor* y, const std::vector& axis) { + void Transpose(const phi::DenseTensor* x, + phi::DenseTensor* y, + const std::vector& axis) { // y should be init first TransposeFromMLUTensor(ctx, axis, x, y, false /*need_reshape_or_alloc*/); } - void Add(const Tensor* x, const Tensor* y, Tensor* z) { + void Add(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc y_desc(*y); @@ -49,7 +53,9 @@ struct IouFunction { ToCnnlDataType()); } - void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + void Sub(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc y_desc(*y); @@ -67,7 +73,9 @@ struct IouFunction { GetBasePtr(z), ToCnnlDataType()); } - void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + void Mul(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // z should be init first MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc y_desc(*y); @@ -85,7 +93,9 @@ struct IouFunction { GetBasePtr(z), ToCnnlDataType()); } - void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) { + void DivNoNan(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // z should be init first MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc y_desc(*y); @@ -102,7 +112,7 @@ struct IouFunction { z_desc.get(), GetBasePtr(z)); } - void Adds(const Tensor* x, float scalar, Tensor* y) { + void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { // y should be init first MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc y_desc(*y); @@ -116,7 +126,9 @@ struct IouFunction { y_desc.get(), GetBasePtr(y)); } - void Maximum(const Tensor* x, const Tensor* y, Tensor* z) { + void Maximum(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // z should be init first MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc y_desc(*y); @@ -130,7 +142,9 @@ struct IouFunction { z_desc.get(), GetBasePtr(z)); } - void Minimum(const Tensor* x, const Tensor* y, Tensor* z) { + void Minimum(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // z should be init first MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc y_desc(*y); @@ -155,7 +169,7 @@ class IouSimilarityMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); + auto* y = ctx.Input("Y"); bool normalized = ctx.Attr("box_normalized"); auto* out = ctx.Output("Out"); diff --git a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc index 19c9d516976b7..ab7716a909588 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template struct IouFunction { @@ -28,43 +28,57 @@ struct IouFunction { stream = ctx.template device_context() .stream(); } - void Transpose(const Tensor* x, Tensor* y, const std::vector& axis) { + void Transpose(const phi::DenseTensor* x, + phi::DenseTensor* y, + const std::vector& axis) { // y should be init first const auto& runner = NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); runner.Run(stream); } - void Add(const Tensor* x, const Tensor* y, Tensor* z) { + void Add(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + void Sub(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + void Mul(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); runner.Run(stream); } - void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) { + void DivNoNan(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Adds(const Tensor* x, float scalar, Tensor* y) { + void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { // y should be init first const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); runner.Run(stream); } - void Maximum(const Tensor* x, const Tensor* y, Tensor* z) { + void Maximum(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // z should be init first const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Minimum(const Tensor* x, const Tensor* y, Tensor* z) { + void Minimum(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // z should be init first const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); runner.Run(stream); @@ -81,7 +95,7 @@ class IouSimilarityNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); + auto* y = ctx.Input("Y"); bool normalized = ctx.Attr("box_normalized"); auto* out = ctx.Output("Out"); diff --git a/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc b/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc index 04e2b758e8074..0a9f077e03bf0 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op_xpu.cc @@ -24,7 +24,7 @@ class XPUIOUSimilarityKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const framework::LoDTensor* in_x = ctx.Input("X"); - const framework::Tensor* in_y = ctx.Input("Y"); + const phi::DenseTensor* in_y = ctx.Input("Y"); bool normalized = ctx.Attr("box_normalized"); framework::LoDTensor* out = ctx.Output("Out"); diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index 
16e2c28265d14..f75e4c96ba81d 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class LocalityAwareNMSOp : public framework::OperatorWithKernel { @@ -166,8 +166,8 @@ void GetMaxScoreIndexWithLocalityAware( template class LocalityAwareNMSKernel : public framework::OpKernel { public: - void LocalityAwareNMSFast(Tensor* bbox, - Tensor* scores, + void LocalityAwareNMSFast(phi::DenseTensor* bbox, + phi::DenseTensor* scores, const T score_threshold, const T nms_threshold, const T eta, @@ -237,8 +237,8 @@ class LocalityAwareNMSKernel : public framework::OpKernel { } void LocalityAwareNMS(const framework::ExecutionContext& ctx, - Tensor* scores, - Tensor* bboxes, + phi::DenseTensor* scores, + phi::DenseTensor* bboxes, const int scores_size, std::map>* indices, int* num_nmsed_out) const { @@ -309,11 +309,11 @@ class LocalityAwareNMSKernel : public framework::OpKernel { void LocalityAwareNMSOutput( const platform::DeviceContext& ctx, - const Tensor& scores, - const Tensor& bboxes, + const phi::DenseTensor& scores, + const phi::DenseTensor& bboxes, const std::map>& selected_indices, const int scores_size, - Tensor* outs, + phi::DenseTensor* outs, int* oindices = nullptr, const int offset = 0) const { int64_t predict_dim = scores.dims()[1]; diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index 1c755c62ebc1b..618b2bdd23d89 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class MatrixNMSOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index f3df3b228d7ee..577b4ca572f36 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -53,10 +53,10 @@ template class MineHardExamplesKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_cls_loss = ctx.Input("ClsLoss"); - auto* in_loc_loss = ctx.Input("LocLoss"); - auto* in_matched_indices = ctx.Input("MatchIndices"); - auto* in_match_dist = ctx.Input("MatchDist"); + auto* in_cls_loss = ctx.Input("ClsLoss"); + auto* in_loc_loss = ctx.Input("LocLoss"); + auto* in_matched_indices = ctx.Input("MatchIndices"); + auto* in_match_dist = ctx.Input("MatchDist"); float neg_pos_ratio = ctx.Attr("neg_pos_ratio"); T neg_dist_threshold = static_cast(ctx.Attr("neg_dist_threshold")); @@ -66,7 +66,7 @@ class MineHardExamplesKernel : public framework::OpKernel { auto out_neg_indices = ctx.Output("NegIndices"); auto out_match_indices = - ctx.Output("UpdatedMatchIndices"); + ctx.Output("UpdatedMatchIndices"); framework::TensorCopy( *in_matched_indices, ctx.GetPlace(), out_match_indices); diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 67b26ddbc2df9..676ee804e23bc 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -21,10 +21,11 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; -inline std::vector GetNmsLodFromRoisNum(const Tensor* rois_num) { +inline std::vector GetNmsLodFromRoisNum( + const phi::DenseTensor* rois_num) { std::vector rois_lod; auto* rois_num_data = rois_num->data(); rois_lod.push_back(static_cast(0)); @@ -124,9 +125,9 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { template void SliceOneClass(const platform::DeviceContext& ctx, - const framework::Tensor& items, + const phi::DenseTensor& items, const int class_id, - framework::Tensor* one_class_item) { + phi::DenseTensor* one_class_item) { T* item_data = one_class_item->mutable_data(ctx.GetPlace()); const T* items_data = items.data(); const int64_t num_item = items.dims()[0]; @@ -148,8 +149,8 @@ void SliceOneClass(const platform::DeviceContext& ctx, template class MultiClassNMSKernel : public framework::OpKernel { public: - void NMSFast(const Tensor& bbox, - const Tensor& scores, + void NMSFast(const phi::DenseTensor& bbox, + const phi::DenseTensor& scores, const T score_threshold, const T nms_threshold, const T eta, @@ -211,8 +212,8 @@ class MultiClassNMSKernel : public framework::OpKernel { } void MultiClassNMS(const framework::ExecutionContext& ctx, - const Tensor& scores, - const Tensor& bboxes, + const phi::DenseTensor& scores, + const phi::DenseTensor& bboxes, const int scores_size, std::map>* indices, int* num_nmsed_out) const { @@ -301,11 +302,11 @@ class MultiClassNMSKernel : public framework::OpKernel { } void MultiClassOutput(const platform::DeviceContext& ctx, - const Tensor& scores, - const Tensor& bboxes, + const phi::DenseTensor& scores, + const phi::DenseTensor& bboxes, const std::map>& selected_indices, const int scores_size, - Tensor* outs, + phi::DenseTensor* outs, int* oindices = nullptr, const int offset = 0) const { int64_t class_num = scores.dims()[1]; @@ -362,7 +363,7 @@ class MultiClassNMSKernel : public framework::OpKernel { bool return_index = ctx.HasOutput("Index") ? true : false; auto index = ctx.Output("Index"); bool has_roisnum = ctx.HasInput("RoisNum") ? true : false; - auto rois_num = ctx.Input("RoisNum"); + auto rois_num = ctx.Input("RoisNum"); auto score_dims = scores->dims(); auto score_size = score_dims.size(); auto& dev_ctx = ctx.template device_context(); @@ -467,7 +468,7 @@ class MultiClassNMSKernel : public framework::OpKernel { } } if (ctx.HasOutput("NmsRoisNum")) { - auto* nms_rois_num = ctx.Output("NmsRoisNum"); + auto* nms_rois_num = ctx.Output("NmsRoisNum"); nms_rois_num->mutable_data({n}, ctx.GetPlace()); int* num_data = nms_rois_num->data(); for (int i = 1; i <= n; i++) { diff --git a/paddle/fluid/operators/detection/nms_op.cc b/paddle/fluid/operators/detection/nms_op.cc index 03680538f778e..66682c67870ba 100644 --- a/paddle/fluid/operators/detection/nms_op.cc +++ b/paddle/fluid/operators/detection/nms_op.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class NMSOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc index 7450ffa876339..e386465c3bdf6 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
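[Editor's note] The reflowed GetNmsLodFromRoisNum above turns a per-image RoI count tensor into a cumulative LoD vector used to slice boxes and scores per image. Only the signature and the first statements survive in the flattened hunk; the loop below is an assumed completion, and the size_t element type is likewise an assumption.

// Builds an exclusive prefix sum over the RoI counts: rois_lod[i] is the
// starting row of image i, and rois_lod.back() is the total number of RoIs.
#include <vector>

#include "paddle/phi/core/dense_tensor.h"

inline std::vector<size_t> GetNmsLodFromRoisNum(
    const phi::DenseTensor* rois_num) {
  std::vector<size_t> rois_lod;
  auto* rois_num_data = rois_num->data<int>();
  rois_lod.push_back(static_cast<size_t>(0));
  for (int i = 0; i < rois_num->numel(); ++i) {
    rois_lod.push_back(rois_lod.back() +
                       static_cast<size_t>(rois_num_data[i]));
  }
  return rois_lod;
}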
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class PolygonBoxTransformCPUKernel : public framework::OpKernel { @@ -27,10 +27,10 @@ class PolygonBoxTransformCPUKernel : public framework::OpKernel { platform::is_cpu_place(ctx.GetPlace()), true, platform::errors::InvalidArgument("It must use CUDAPlace.")); - auto* in = ctx.Input("Input"); + auto* in = ctx.Input("Input"); auto in_dims = in->dims(); const T* in_data = in->data(); - auto* out = ctx.Output("Output"); + auto* out = ctx.Output("Output"); T* out_data = out->mutable_data(ctx.GetPlace()); int batch_size = in_dims[0]; diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index c90b5b4de0268..49e3d3d96ba5d 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using platform::PADDLE_CUDA_NUM_THREADS; #define CUDA_BLOCK_SIZE 16 @@ -48,10 +48,10 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "The polygon_box_transform operator needs to be executed on GPU.")); - auto* in = ctx.Input("Input"); + auto* in = ctx.Input("Input"); auto in_dims = in->dims(); const T* in_data = in->data(); - auto* out = ctx.Output("Output"); + auto* out = ctx.Output("Output"); T* out_data = out->mutable_data(ctx.GetPlace()); int batch_size = in_dims[0]; diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc index 03733e34ec670..de6e0822fe37c 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cc +++ b/paddle/fluid/operators/detection/prior_box_op.cc @@ -43,7 +43,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { library_ = framework::LibraryType::kMKLDNN; layout_ = framework::DataLayout::kMKLDNN; auto input_image_type = framework::TransToProtoVarType( - ctx.Input("Image")->dtype()); + ctx.Input("Image")->dtype()); int customized_type_value = framework::OpKernelType::kDefaultCustomizedTypeValue; if (input_image_type == framework::DataTypeTrait::DataType()) { diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h index 889bc8354bc41..3adbfda50a779 100644 --- a/paddle/fluid/operators/detection/prior_box_op.h +++ b/paddle/fluid/operators/detection/prior_box_op.h @@ -54,10 +54,10 @@ template class PriorBoxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* vars = ctx.Output("Variances"); + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); auto min_sizes = ctx.Attr>("min_sizes"); auto max_sizes = ctx.Attr>("max_sizes"); @@ -171,7 +171,7 @@ class PriorBoxOpKernel : public framework::OpKernel { }); } - framework::Tensor var_t; + phi::DenseTensor var_t; var_t.mutable_data( phi::make_ddim({1, static_cast(variances.size())}), ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/prior_box_op_npu.cc b/paddle/fluid/operators/detection/prior_box_op_npu.cc index 9098c4084e143..8a3a313be159c 100644 
--- a/paddle/fluid/operators/detection/prior_box_op_npu.cc +++ b/paddle/fluid/operators/detection/prior_box_op_npu.cc @@ -18,16 +18,16 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class PriorBoxNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* variances = ctx.Output("Variances"); + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* variances = ctx.Output("Variances"); PADDLE_ENFORCE_EQ(boxes->dims(), variances->dims(), diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index 2f3b59db5c038..2ebe2915e81ce 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class RetinanetDetectionOutputOp : public framework::OperatorWithKernel { @@ -413,7 +413,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { const std::vector& scores, const std::vector& bboxes, const std::vector& anchors, - const Tensor& im_info, + const phi::DenseTensor& im_info, std::vector>* nmsed_out, int* num_nmsed_out) const { int64_t nms_top_k = ctx.Attr("nms_top_k"); @@ -471,7 +471,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { void MultiClassOutput(const platform::DeviceContext& ctx, const std::vector>& nmsed_out, - Tensor* outs) const { + phi::DenseTensor* outs) const { auto* odata = outs->data(); int count = 0; int64_t out_dim = 6; @@ -487,9 +487,9 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext& ctx) const override { - auto boxes = ctx.MultiInput("BBoxes"); - auto scores = ctx.MultiInput("Scores"); - auto anchors = ctx.MultiInput("Anchors"); + auto boxes = ctx.MultiInput("BBoxes"); + auto scores = ctx.MultiInput("Scores"); + auto anchors = ctx.MultiInput("Anchors"); auto* im_info = ctx.Input("ImInfo"); auto* outs = ctx.Output("Out"); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 9994864b47d2b..c91a4f6c30a77 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -249,12 +249,12 @@ template class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); + auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* mask = ctx.Output("Mask"); + auto* out = ctx.Output("Out"); + auto* mask = ctx.Output("Mask"); auto* out_transform_matrix = - ctx.Output("TransformMatrix"); + ctx.Output("TransformMatrix"); auto transformed_height = ctx.Attr("transformed_height"); auto transformed_width = ctx.Attr("transformed_width"); auto spatial_scale = ctx.Attr("spatial_scale"); @@ -268,7 +268,7 @@ class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel { const T* input_data = in->data(); int* mask_data = mask->mutable_data(ctx.GetPlace()); - framework::Tensor roi2image; + phi::DenseTensor roi2image; roi2image.Resize({rois_num}); int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); auto lod = rois->lod().back(); @@ -397,11 +397,10 @@ template class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); + auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); auto transformed_height = ctx.Attr("transformed_height"); auto transformed_width = ctx.Attr("transformed_width"); @@ -418,7 +417,7 @@ class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { const T* out_grad_data = out_grad->data(); const T* rois_data = rois->data(); - framework::Tensor roi2image; + phi::DenseTensor roi2image; roi2image.Resize({rois_num}); int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); auto lod = rois->lod().back(); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 73b28f8f0e476..dee0a8e69d6d8 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -367,14 +367,14 @@ template class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); + auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* out2in_idx = ctx.Output("Out2InIdx"); - auto* out2in_w = ctx.Output("Out2InWeights"); - auto* mask = ctx.Output("Mask"); + auto* out = ctx.Output("Out"); + auto* out2in_idx = ctx.Output("Out2InIdx"); + auto* out2in_w = ctx.Output("Out2InWeights"); + auto* mask = ctx.Output("Mask"); auto* out_transform_matrix = - ctx.Output("TransformMatrix"); + ctx.Output("TransformMatrix"); int* mask_data = mask->mutable_data(ctx.GetPlace()); int* out2in_idx_data = @@ -400,8 +400,8 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { T* output_data = out->mutable_data(ctx.GetPlace()); const T* rois_data = rois->data(); - framework::Tensor roi2image; - framework::Tensor roi2image_dev; + phi::DenseTensor roi2image; + 
phi::DenseTensor roi2image_dev; roi2image.Resize({rois_num}); int* roi2image_data = roi2image.mutable_data(platform::CPUPlace()); auto lod = rois->lod().back(); @@ -513,9 +513,8 @@ class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* out2in_idx = ctx.Input("Out2InIdx"); auto* out2in_w = ctx.Input("Out2InWeights"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index c6e4c00f79bba..f60cef3d1b554 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template -void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) { +void AppendRpns(LoDTensor* out, int64_t offset, phi::DenseTensor* to_add) { auto* out_data = out->data(); auto* to_add_data = to_add->data(); memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); @@ -113,7 +113,7 @@ void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) { template std::vector FilterStraddleAnchor(const phi::CPUContext& context, - const Tensor* anchor, + const phi::DenseTensor* anchor, const float rpn_straddle_thresh, T im_height, T im_width) { @@ -154,8 +154,8 @@ std::vector FilterStraddleAnchor(const phi::CPUContext& context, template Tensor FilterCrowdGt(const phi::CPUContext& context, - Tensor* gt_boxes, - Tensor* is_crowd) { + phi::DenseTensor* gt_boxes, + phi::DenseTensor* is_crowd) { int gt_num = gt_boxes->dims()[0]; std::vector not_crowd_inds; auto* is_crowd_data = is_crowd->data(); @@ -196,8 +196,8 @@ void ReservoirSampling(const int num, template void ScoreAssign(const T* anchor_by_gt_overlap_data, - const Tensor& anchor_to_gt_max, - const Tensor& gt_to_anchor_max, + const phi::DenseTensor& anchor_to_gt_max, + const phi::DenseTensor& gt_to_anchor_max, const int rpn_batch_size_per_im, const float rpn_fg_fraction, const float rpn_positive_overlap, @@ -299,14 +299,15 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data, } template -std::vector SampleRpnFgBgGt(const phi::CPUContext& ctx, - const Tensor& anchor_by_gt_overlap, - const int rpn_batch_size_per_im, - const float rpn_positive_overlap, - const float rpn_negative_overlap, - const float rpn_fg_fraction, - std::minstd_rand engine, - bool use_random) { +std::vector SampleRpnFgBgGt( + const phi::CPUContext& ctx, + const phi::DenseTensor& anchor_by_gt_overlap, + const int rpn_batch_size_per_im, + const float rpn_positive_overlap, + const float rpn_negative_overlap, + const float rpn_fg_fraction, + std::minstd_rand engine, + bool use_random) { auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); int anchor_num = anchor_by_gt_overlap.dims()[0]; int gt_num = anchor_by_gt_overlap.dims()[1]; @@ -393,7 +394,7 @@ template class RpnTargetAssignKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 + 
auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 auto* gt_boxes = context.Input("GtBoxes"); auto* is_crowd = context.Input("IsCrowd"); auto* im_info = context.Input("ImInfo"); @@ -857,9 +858,9 @@ class RetinanetTargetAssignOp : public framework::OperatorWithKernel { template std::vector FilterCrowdGtBoxLabel(const phi::CPUContext& context, - Tensor* gt_boxes, - Tensor* gt_labels, - Tensor* is_crowd) { + phi::DenseTensor* gt_boxes, + phi::DenseTensor* gt_labels, + phi::DenseTensor* is_crowd) { int gt_num = gt_boxes->dims()[0]; std::vector not_crowd_inds; auto* is_crowd_data = is_crowd->data(); @@ -892,8 +893,8 @@ std::vector FilterCrowdGtBoxLabel(const phi::CPUContext& context, template std::vector GetAllFgBgGt(const phi::CPUContext& ctx, - const Tensor& anchor_by_gt_overlap, - const Tensor& ncrowd_gt_labels, + const phi::DenseTensor& anchor_by_gt_overlap, + const phi::DenseTensor& ncrowd_gt_labels, const float positive_overlap, const float negative_overlap, std::minstd_rand engine) { @@ -992,7 +993,7 @@ template class RetinanetTargetAssignKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 + auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 auto* gt_boxes = context.Input("GtBoxes"); auto* gt_labels = context.Input("GtLabels"); auto* is_crowd = context.Input("IsCrowd"); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc index bc23c5105db94..91479a78b63b1 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class SigmoidFocalLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index 3def90fd459e5..bad93fd22b2e9 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -19,7 +19,7 @@ limitations under the License. 
 */
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaxinumNumBlocks = 4096;
@@ -119,10 +119,10 @@ template
 class GPUSigmoidFocalLossKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input("X");
-    const Tensor *Labels = context.Input("Label");
-    const Tensor *FgNum = context.Input("FgNum");
-    Tensor *Out = context.Output("Out");
+    const Tensor *X = context.Input("X");
+    const Tensor *Labels = context.Input("Label");
+    const Tensor *FgNum = context.Input("FgNum");
+    Tensor *Out = context.Output("Out");
     T gamma = static_cast(context.Attr("gamma"));
     T alpha = static_cast(context.Attr("alpha"));
     auto x_dims = X->dims();
@@ -150,11 +150,12 @@ template
 class GPUSigmoidFocalLossGradKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext &context) const override {
-    const Tensor *X = context.Input("X");
-    const Tensor *Labels = context.Input("Label");
-    const Tensor *FgNum = context.Input("FgNum");
-    const Tensor *dOut = context.Input(framework::GradVarName("Out"));
-    Tensor *dX = context.Output(framework::GradVarName("X"));
+    const Tensor *X = context.Input("X");
+    const Tensor *Labels = context.Input("Label");
+    const Tensor *FgNum = context.Input("FgNum");
+    const Tensor *dOut =
+        context.Input(framework::GradVarName("Out"));
+    Tensor *dX = context.Output(framework::GradVarName("X"));
     auto dx_data = dX->mutable_data(context.GetPlace());
     T gamma = static_cast(context.Attr("gamma"));
     T alpha = static_cast(context.Attr("alpha"));
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
index 4ad9743cfca94..b7c77a5e28222 100644
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
+++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h
@@ -22,16 +22,16 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SigmoidFocalLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *FgNum = context.Input("FgNum"); - Tensor *Out = context.Output("Out"); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *FgNum = context.Input("FgNum"); + Tensor *Out = context.Output("Out"); T gamma = static_cast(context.Attr("gamma")); T alpha = static_cast(context.Attr("alpha")); auto out_data = Out->mutable_data(context.GetPlace()); @@ -79,11 +79,12 @@ template class SigmoidFocalLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *FgNum = context.Input("FgNum"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *FgNum = context.Input("FgNum"); + const Tensor *dOut = + context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); auto dx_data = dX->mutable_data(context.GetPlace()); T gamma = static_cast(context.Attr("gamma")); T alpha = static_cast(context.Attr("alpha")); diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index 55481dc3e8166..a7c66bcf02e07 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -97,10 +97,10 @@ class TargetAssignKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); - auto* match_indices = ctx.Input("MatchIndices"); + auto* match_indices = ctx.Input("MatchIndices"); - auto* out = ctx.Output("Out"); - auto* out_wt = ctx.Output("OutWeight"); + auto* out = ctx.Output("Out"); + auto* out_wt = ctx.Output("OutWeight"); PADDLE_ENFORCE_EQ(x->lod().size(), 1UL, diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 3261f8fca3d20..257347f663c68 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -17,8 +17,6 @@ namespace paddle { namespace operators { -using framework::Tensor; - class YoloBoxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 0448d7e5183c8..5f6ffece3bf54 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -21,8 +21,6 @@ namespace paddle { namespace operators { -using framework::Tensor; - class Yolov3LossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 8c30ae28f4e76..51fdd4ad1f2ec 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ 
b/paddle/fluid/operators/detection_map_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class DetectionMAPOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index 3ed55f6697f1a..cde33cd956419 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -62,13 +62,13 @@ class DetectionMAPOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in_detect = ctx.Input("DetectRes"); auto* in_label = ctx.Input("Label"); - auto* out_map = ctx.Output("MAP"); + auto* out_map = ctx.Output("MAP"); - auto* in_pos_count = ctx.Input("PosCount"); + auto* in_pos_count = ctx.Input("PosCount"); auto* in_true_pos = ctx.Input("TruePos"); auto* in_false_pos = ctx.Input("FalsePos"); - auto* out_pos_count = ctx.Output("AccumPosCount"); + auto* out_pos_count = ctx.Output("AccumPosCount"); auto* out_true_pos = ctx.Output("AccumTruePos"); auto* out_false_pos = ctx.Output("AccumFalsePos"); @@ -241,7 +241,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { const std::map& label_pos_count, const std::map>>& true_pos, const std::map>>& false_pos, - framework::Tensor* output_pos_count, + phi::DenseTensor* output_pos_count, framework::LoDTensor* output_true_pos, framework::LoDTensor* output_false_pos, const int class_num) const { @@ -307,7 +307,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { output_false_pos->set_lod(false_pos_lod); } - void GetInputPos(const framework::Tensor& input_pos_count, + void GetInputPos(const phi::DenseTensor& input_pos_count, const framework::LoDTensor& input_true_pos, const framework::LoDTensor& input_false_pos, std::map* label_pos_count, diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc index 9949fefb1b18b..7c75949039358 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.cc +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc @@ -33,7 +33,7 @@ class DGCClipByNormOp : public ClipByNormOp { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "current_step") { VLOG(10) << "var_name:" << var_name << " need not to transform"; diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h index 27c30a8997b2c..8637ac88a422d 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.h +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class DGCClipByNormKernel : public framework::OpKernel { @@ -32,7 +32,7 @@ class DGCClipByNormKernel : public framework::OpKernel { return; } - auto current_step_tensor = ctx.Input("current_step"); + auto current_step_tensor = ctx.Input("current_step"); auto* current_step = current_step_tensor->data(); VLOG(10) << "current_step:" << *current_step @@ -50,8 +50,8 @@ class DGCClipByNormKernel : public framework::OpKernel { auto& dev_ctx = ctx.device_context(); if (in_var->IsType()) { - auto* x = ctx.Input("X"); - auto* y = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Output("Out"); return phi::ClipByNormKernel( static_cast::TYPE&>(dev_ctx), diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc index 1f7b5dbdce9c8..e247ab05ebadd 100644 --- a/paddle/fluid/operators/dgc_op.cc +++ b/paddle/fluid/operators/dgc_op.cc @@ -47,7 +47,7 @@ class DGCOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "current_step" || var_name == "k" || var_name == "nranks") { VLOG(10) << "var_name:" << var_name << " need not to transform"; diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 82e002cbb3389..44121a9434c72 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -53,11 +53,11 @@ template class DGCOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto u = ctx.Input("U"); - auto v = ctx.Input("V"); - auto g = ctx.Input("Grad"); + auto u = ctx.Input("U"); + auto v = ctx.Input("V"); + auto g = ctx.Input("Grad"); - auto grad_out = ctx.Output("Grad_out"); + auto grad_out = ctx.Output("Grad_out"); // attrs float m = ctx.Attr("m"); @@ -67,7 +67,7 @@ class DGCOpKernel : public framework::OpKernel { auto rampup_step = ctx.Attr("rampup_step"); // nranks - auto nranks_tensor = ctx.Input("nranks"); + auto nranks_tensor = ctx.Input("nranks"); const int nranks = static_cast(*nranks_tensor->data()); PADDLE_ENFORCE_GT(nranks, 1, @@ -76,7 +76,7 @@ class DGCOpKernel : public framework::OpKernel { "use multi card or multi machine GPU")); // regularization - auto p = ctx.Input("Param"); + auto p = ctx.Input("Param"); float regular_coeff = ctx.Attr("regular_coeff"); int regular_type = ctx.Attr("regular_type"); @@ -110,7 +110,7 @@ class DGCOpKernel : public framework::OpKernel { } // current step - auto current_step_tensor = ctx.Input("current_step"); + auto current_step_tensor = ctx.Input("current_step"); const float* current_step = current_step_tensor->data(); if (static_cast(*current_step) < static_cast(rampup_begin_step)) { @@ -140,14 +140,14 @@ class DGCOpKernel : public framework::OpKernel { << ", current_step:" << *current_step << ", ratio:" << ratio << ", k:" << k << ", nranks:" << nranks; - auto k_out = ctx.Output("k"); + auto k_out = ctx.Output("k"); T* k_out_data = k_out->data(); *k_out_data = k; - auto u_out = ctx.Output("U_out"); - auto v_out = ctx.Output("V_out"); - auto encode_grad_out = ctx.Output("EncodeGrad"); - auto gather_buff = ctx.Output("GatherBuff"); + auto u_out = ctx.Output("U_out"); + auto v_out = ctx.Output("V_out"); + auto encode_grad_out = ctx.Output("EncodeGrad"); + auto 
gather_buff = ctx.Output("GatherBuff"); // FIXME(gongwb): use cublas. auto u_out_e = framework::EigenVector::Flatten(*u_out); diff --git a/paddle/fluid/operators/diag_op.h b/paddle/fluid/operators/diag_op.h index 9c9ff69586a1f..e3514e59e806d 100644 --- a/paddle/fluid/operators/diag_op.h +++ b/paddle/fluid/operators/diag_op.h @@ -39,10 +39,10 @@ template class DiagKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* diagonal = context.Input("Diagonal"); + auto* diagonal = context.Input("Diagonal"); auto* diag_data = diagonal->data(); auto numel = diagonal->numel(); - auto* out = context.Output("Out"); + auto* out = context.Output("Out"); T* out_data = out->mutable_data(context.GetPlace()); phi::funcs::SetConstant set_zero; diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index bc3eedacc5b0d..413d02e3b6738 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -267,25 +267,25 @@ __global__ void VectorizedGeneratorMask(const size_t n, } inline void CalcBroadcastedMask(const phi::GPUContext& dev_ctx, - const framework::Tensor& mask, - framework::Tensor* broadcasted_mask) { + const phi::DenseTensor& mask, + phi::DenseTensor* broadcasted_mask) { // The broadcast of mask can be combined to the following ElementwiseKernel // when the BroadcastKernel supports different input types. broadcasted_mask->mutable_data(dev_ctx.GetPlace()); - std::vector ins = {&mask}; - std::vector outs = {broadcasted_mask}; + std::vector ins = {&mask}; + std::vector outs = {broadcasted_mask}; phi::funcs::BroadcastKernel( dev_ctx, ins, &outs, -1, kps::IdentityFunctor()); } template void ScaleByDropoutFactor(const phi::GPUContext& dev_ctx, - const framework::Tensor& x, - framework::Tensor* y, + const phi::DenseTensor& x, + phi::DenseTensor* y, MT factor) { - std::vector ins = {&x}; - std::vector outs = {y}; + std::vector ins = {&x}; + std::vector outs = {y}; auto functor = phi::funcs::ScaleFunctor(factor); phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); } @@ -297,10 +297,10 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool upscale_in_train, bool is_fix_seed, int seed_val, - const framework::Tensor& x, - const framework::Tensor* seed, - framework::Tensor* mask, - framework::Tensor* y, + const phi::DenseTensor& x, + const phi::DenseTensor* seed, + phi::DenseTensor* mask, + phi::DenseTensor* y, bool is_dropout_nd = false) { int64_t x_numel = x.numel(); auto stream = dev_ctx.stream(); @@ -359,14 +359,14 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, increment, main_offset); - framework::Tensor broadcasted_mask; + phi::DenseTensor broadcasted_mask; broadcasted_mask.Resize(x.dims()); CalcBroadcastedMask(dev_ctx, *mask, &broadcasted_mask); auto dst_functor = DstFunctor( 1.0f - dropout_prob, upscale_in_train, x_numel); - std::vector ins = {&x, &broadcasted_mask}; - std::vector outs = {y}; + std::vector ins = {&x, &broadcasted_mask}; + std::vector outs = {y}; phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, dst_functor); } else { #define PD_DROPOUT_KERNEL_NAME VectorizedRandomGenerator @@ -424,9 +424,9 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, float dropout_prob, bool upscale_in_train, - const framework::Tensor& grad_y, - const framework::Tensor& mask, - framework::Tensor* grad_x, + const phi::DenseTensor& grad_y, + const phi::DenseTensor& mask, + phi::DenseTensor* grad_x, 
bool is_dropout_nd = false) { using MT = typename details::MPTypeTrait::Type; @@ -436,15 +436,15 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, // y = factor * x ScaleByDropoutFactor(dev_ctx, grad_y, grad_x, factor); } else { - framework::Tensor broadcasted_mask; + phi::DenseTensor broadcasted_mask; if (is_dropout_nd) { broadcasted_mask.Resize(grad_y.dims()); CalcBroadcastedMask(dev_ctx, mask, &broadcasted_mask); } - std::vector ins = { + std::vector ins = { &grad_y, is_dropout_nd ? &broadcasted_mask : &mask}; - std::vector outs = {grad_x}; + std::vector outs = {grad_x}; if (upscale_in_train) { if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index 88e492efcc45a..84ff221cbe139 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -21,7 +21,7 @@ namespace paddle { namespace operators { inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, - const framework::Tensor* seed, + const phi::DenseTensor* seed, const bool is_fix_seed, const int seed_val, const int offset, @@ -31,7 +31,7 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, auto gen_cuda = framework::DefaultCUDAGenerator(device_id); if (seed) { - framework::Tensor seed_cpu_tensor; + phi::DenseTensor seed_cpu_tensor; paddle::framework::TensorCopySync( *seed, platform::CPUPlace(), &seed_cpu_tensor); *seed_data = static_cast(seed_cpu_tensor.data()[0]); diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 43ae066b527fd..804834a974aad 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class DropoutOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -37,7 +35,7 @@ class DropoutOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "Seed") { VLOG(10) << "var_name:" << var_name diff --git a/paddle/fluid/operators/dropout_op_mlu.cc b/paddle/fluid/operators/dropout_op_mlu.cc index 142e047e6c2b1..7cf98738d073f 100644 --- a/paddle/fluid/operators/dropout_op_mlu.cc +++ b/paddle/fluid/operators/dropout_op_mlu.cc @@ -18,18 +18,18 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class DropoutMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto dropout_prob = ctx.Attr("dropout_prob"); auto is_test = ctx.Attr("is_test"); auto* seed_tensor = - ctx.HasInput("Seed") ? ctx.Input("Seed") : nullptr; + ctx.HasInput("Seed") ? ctx.Input("Seed") : nullptr; auto dropout_implementation = ctx.Attr("dropout_implementation"); @@ -65,7 +65,7 @@ class DropoutMLUKernel : public framework::OpKernel { seed_data = ctx.Attr("fix_seed") ? 
ctx.Attr("seed") : 0; } - auto* mask = ctx.Output("Mask"); + auto* mask = ctx.Output("Mask"); mask->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc mask_desc(*mask); // Special case when dropout_prob is 1.0 @@ -137,9 +137,9 @@ class DropoutGradMLUKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "GradOp is only callable when is_test is false")); - auto* grad_x = ctx.Output(framework::GradVarName("X")); - auto* grad_out = ctx.Input(framework::GradVarName("Out")); - auto* mask = ctx.Input("Mask"); + auto* grad_x = ctx.Output(framework::GradVarName("X")); + auto* grad_out = ctx.Input(framework::GradVarName("Out")); + auto* mask = ctx.Input("Mask"); auto dropout_prob = ctx.Attr("dropout_prob"); auto dropout_impl = ctx.Attr("dropout_implementation"); diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 96e2b6d956777..a63b6e5e479af 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -23,17 +23,17 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class DropoutNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); auto* seed_tensor = - ctx.HasInput("Seed") ? ctx.Input("Seed") : nullptr; - auto* out = ctx.Output("Out"); - auto* mask = ctx.Output("Mask"); + ctx.HasInput("Seed") ? ctx.Input("Seed") : nullptr; + auto* out = ctx.Output("Out"); + auto* mask = ctx.Output("Mask"); auto dropout_prob = ctx.Attr("dropout_prob"); auto is_test = ctx.Attr("is_test"); @@ -151,9 +151,9 @@ template class DropoutGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* mask = ctx.Input("Mask"); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* mask = ctx.Input("Mask"); auto dropout_prob = ctx.Attr("dropout_prob"); auto is_test = ctx.Attr("is_test"); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index c9c81f2ed8a61..5fba57c037977 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -40,9 +40,7 @@ namespace paddle { namespace operators { -using paddle::framework::Tensor; - -inline int BatchCount(const Tensor& matrix) { +inline int BatchCount(const phi::DenseTensor& matrix) { int count = 1; int num_dims = matrix.dims().size(); for (int i = 0; i < num_dims - 2; ++i) { @@ -51,7 +49,7 @@ inline int BatchCount(const Tensor& matrix) { return count; } -inline int MatrixStride(const Tensor& matrix) { +inline int MatrixStride(const phi::DenseTensor& matrix) { framework::DDim dims_list = matrix.dims(); int num_dims = dims_list.size(); return dims_list[num_dims - 1] * dims_list[num_dims - 2]; diff --git a/paddle/fluid/operators/eigh_op.cc b/paddle/fluid/operators/eigh_op.cc index c85a7d842e4f9..4d2982b314a1c 100644 --- a/paddle/fluid/operators/eigh_op.cc +++ b/paddle/fluid/operators/eigh_op.cc @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; - class EighOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/eigvalsh_op.cc b/paddle/fluid/operators/eigvalsh_op.cc index 9ba892b61badf..9d09b96280e2f 100644 --- a/paddle/fluid/operators/eigvalsh_op.cc +++ b/paddle/fluid/operators/eigvalsh_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class EigvalshOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc index 9e3da4ed6af80..456a11f95aaca 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ElementwiseAddMLUKernel : public framework::OpKernel { @@ -32,11 +32,11 @@ class ElementwiseAddGradMLUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index 726b4186030d2..70e3de7a0bcd9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ElementwiseAddNPUKernel : public framework::OpKernel { @@ -69,11 +69,11 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); axis = (axis == -1 ? 
std::abs(x->dims().size() - y->dims().size()) : axis); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 3c03b54b6f98f..b1f0817539f17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -24,7 +24,7 @@ namespace operators { class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext* ctx) const override { auto y_grad_name = framework::GradVarName("Y"); @@ -59,7 +59,7 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc index 306ee1952dbe1..27f7281b9fb1e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ElementwiseDivMLUKernel : public framework::OpKernel { @@ -35,12 +35,12 @@ template class ElementwiseDivGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* out = ctx.Input("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); const auto& x_dims = x->dims(); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc index 9ae7782ca01ea..74a2a5b6ca6eb 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc @@ -21,16 +21,16 @@ limitations under the License. 
 */
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;
 template
 class ElementwiseDivNPUKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input("X");
-    auto* y = ctx.Input("Y");
+    auto* x = ctx.Input("X");
+    auto* y = ctx.Input("Y");
-    auto* out = ctx.Output("Out");
+    auto* out = ctx.Output("Out");
     auto place = ctx.GetPlace();
@@ -49,13 +49,13 @@ template
 class ElementwiseDivGradNPUKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* out = ctx.Input("Out");
-    auto* dout = ctx.Input(framework::GradVarName("Out"));
-    auto* x = ctx.Input("X");
-    auto* y = ctx.Input("Y");
+    auto* out = ctx.Input("Out");
+    auto* dout = ctx.Input(framework::GradVarName("Out"));
+    auto* x = ctx.Input("X");
+    auto* y = ctx.Input("Y");
-    auto* dx = ctx.Output(framework::GradVarName("X"));
-    auto* dy = ctx.Output(framework::GradVarName("Y"));
+    auto* dx = ctx.Output(framework::GradVarName("X"));
+    auto* dy = ctx.Output(framework::GradVarName("Y"));
     auto place = ctx.GetPlace();
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc
index 79e283e1ffd35..396f1b6f6223a 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc
@@ -21,15 +21,15 @@ limitations under the License.
 */
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;
 template
 class ElementwiseFloorDivNPUKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input("X");
-    auto* y = ctx.Input("Y");
-    auto* out = ctx.Output("Out");
+    auto* x = ctx.Input("X");
+    auto* y = ctx.Input("Y");
+    auto* out = ctx.Output("Out");
     out->mutable_data(ctx.GetPlace());
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc
index 7cd1f70494256..fe91c28cd1f05 100644
--- a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc
@@ -18,7 +18,7 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ElementwiseMaxNPUKernel : public framework::OpKernel { @@ -27,9 +27,9 @@ class ElementwiseMaxNPUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); @@ -67,11 +67,11 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); // The ascend elementwise_max_grad op only supports broadcast diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc index 7b29f8e4cd3f3..861ed2046c077 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ElementwiseMinMLUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc index e34b88189d3bc..8014f82ca5742 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ElementwiseMinNPUKernel : public framework::OpKernel { @@ -30,10 +30,10 @@ class ElementwiseMinNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); @@ -71,11 +71,11 @@ class ElementwiseMinGradNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); axis = (axis == -1 ? 
std::abs(x->dims().size() - y->dims().size()) : axis); auto stream = dev_ctx.stream(); diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index 50085f531a99d..57f4b0c057686 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -77,9 +77,9 @@ void MLUOpTensorKernel(const framework::ExecutionContext& ctx, platform::errors::Unavailable( "This kernel of MLU only support ADD, SUB, MUL.")); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); @@ -186,9 +186,9 @@ inline void MLUBinary(const framework::ExecutionContext& ctx, template void MLUBinaryOp(const framework::ExecutionContext& ctx) { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); @@ -259,8 +259,8 @@ inline void MLUUnary(const framework::ExecutionContext& ctx, template void MLUUnaryOp(const framework::ExecutionContext& ctx) { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -283,11 +283,11 @@ enum MINMAX_GRAD_FUNCTOR { }; template void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); const auto& x_dims = x->dims(); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc index 5de2f6509fe40..bdeef48389b6c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ElementwiseModNPUKernel : public framework::OpKernel { @@ -26,9 +26,9 @@ class ElementwiseModNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); int axis = ctx.Attr("axis"); auto x_dims = x->dims(); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index ffd36412e3ebe..afc06b0d9981b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -25,7 +25,7 @@ namespace operators { class ElementwiseMulOp : public ElementwiseOp { public: - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; using ElementwiseOp::ElementwiseOp; framework::OpKernelType GetExpectedKernelType( @@ -46,7 +46,7 @@ class ElementwiseMulOp : public ElementwiseOp { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc index 0c41dc40cdc12..fe2848621c76f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using MLUDeviceContext = platform::MLUDeviceContext; template @@ -32,11 +32,11 @@ template class ElementwiseMulGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); const auto& x_dims = x->dims(); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc index fa6fd9c422e81..4fc3be1b29cc7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -28,8 +28,8 @@ static void ReduceDims(const framework::ExecutionContext& ctx, const int axis, const framework::DDim& ddims, const framework::DDim& brd_ddims, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { std::vector axes; int64_t brd_size = brd_ddims.size(); int64_t org_size = ddims.size(); @@ -55,9 +55,9 @@ class ElementwiseMulNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); @@ -91,11 +91,11 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); diff --git a/paddle/fluid/operators/elementwise/elementwise_npu.h b/paddle/fluid/operators/elementwise/elementwise_npu.h index e4b6998a8f3fe..5266491d6f506 100644 --- a/paddle/fluid/operators/elementwise/elementwise_npu.h +++ b/paddle/fluid/operators/elementwise/elementwise_npu.h @@ -20,14 +20,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, - const Tensor* src, + const phi::DenseTensor* src, int axis, const framework::DDim& dst_dims, - Tensor* transformed_src) { + phi::DenseTensor* transformed_src) { auto stream = dev_ctx.stream(); // 1. 
expand the axis with dim 1 @@ -96,11 +96,11 @@ void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, template void NpuElementWiseOpBroadcast(const platform::NPUDeviceContext& dev_ctx, - const Tensor* x, - const Tensor* y, + const phi::DenseTensor* x, + const phi::DenseTensor* y, int axis, - Tensor* transformed_x, - Tensor* transformed_y) { + phi::DenseTensor* transformed_x, + phi::DenseTensor* transformed_y) { auto x_dims = x->dims(); auto y_dims = y->dims(); bool is_xsize_larger = true; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index e722d5f7e6e99..dc054579dc181 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -36,7 +36,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ElementwiseOp"); @@ -170,7 +170,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input @@ -292,7 +292,7 @@ For example: class ElementwiseOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { auto out_grad_name = framework::GradVarName("Out"); @@ -331,7 +331,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input @@ -349,7 +349,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { class ElementwiseOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { auto x_grad_name = framework::GradVarName("X"); @@ -385,7 +385,7 @@ class ElementwiseOpDoubleGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input @@ -404,7 +404,7 @@ class ElementwiseOpDoubleGradWithoutDXDY : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { if (ctx->HasOutput("DDOut")) { @@ -446,7 +446,7 @@ class ElementwiseOpDoubleGradWithoutDXDY 
framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input @@ -464,7 +464,7 @@ class ElementwiseOpDoubleGradWithoutDXDY class ElementwiseOpTripleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { if (ctx->HasOutput("D_DDX")) { @@ -507,7 +507,7 @@ class ElementwiseOpTripleGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index dbb555a0a6a13..25e22f9e2895c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -26,8 +26,8 @@ template void LaunchElementwiseCudaKernel( const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, + const std::vector &ins, + std::vector *outs, int axis, Functor func) { std::vector pt_inputs; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index f81b76aa4877c..2abb15c98ce1f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -69,9 +69,9 @@ namespace operators { */ template int PackTensorsIntoVector(const framework::ExecutionContext &ctx, - std::vector *ins, - std::vector *outs, - framework::Tensor *x_for_selectedrows = nullptr) { + std::vector *ins, + std::vector *outs, + phi::DenseTensor *x_for_selectedrows = nullptr) { int axis = -1; auto x_var = ctx.InputVar("X"); PADDLE_ENFORCE_NOT_NULL( @@ -80,7 +80,7 @@ int PackTensorsIntoVector(const framework::ExecutionContext &ctx, "Unable to get input Variable X, Variable name is %s.\n", ctx.InputName("X"))); auto *y = ctx.Input("Y"); - framework::Tensor *z; + phi::DenseTensor *z; if (x_var->IsType()) { auto *x = ctx.Input("X"); @@ -152,13 +152,13 @@ template void ElemwiseGradCompute(const framework::ExecutionContext &ctx, - const framework::Tensor &x, - const framework::Tensor &y, - const framework::Tensor &out, - const framework::Tensor &dout, + const phi::DenseTensor &x, + const phi::DenseTensor &y, + const phi::DenseTensor &out, + const phi::DenseTensor &dout, int axis, - framework::Tensor *dx, - framework::Tensor *dy, + phi::DenseTensor *dx, + phi::DenseTensor *dy, DX_OP dx_op, DY_OP dy_op) { const auto &dev_ctx = ctx.template device_context(); @@ -180,11 +180,11 @@ template void ElementwiseComputeEx(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, + const phi::DenseTensor *x, + const phi::DenseTensor *y, int axis, Functor func, - framework::Tensor *z) { + phi::DenseTensor *z) { z->mutable_data(ctx.GetPlace()); const auto &dev_ctx = ctx.template device_context(); 
phi::funcs::ElementwiseCompute( @@ -468,11 +468,11 @@ template (phi::product(x_dim)); platform::ForRange for_range( @@ -499,12 +499,12 @@ void FusedElemwiseAndActComputeWithBroadcast( const framework::ExecutionContext &ctx, const framework::DDim &x_dim, const framework::DDim &y_dim_untrimed, - const framework::Tensor &x, - const framework::Tensor &y, + const phi::DenseTensor &x, + const phi::DenseTensor &y, CompoundFunctor compound_functor, int axis, - framework::Tensor *out, - framework::Tensor *intermediate_out) { + phi::DenseTensor *out, + phi::DenseTensor *intermediate_out) { axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); axis = (y_dim.size() == 0) ? x_dim.size() : axis; @@ -642,15 +642,15 @@ void FusedElemwiseAndActGradComputeNoBroadcast( const framework::ExecutionContext &ctx, const framework::DDim &x_dim, const framework::DDim &y_dim, - const framework::Tensor *x, - const framework::Tensor *y, - const framework::Tensor *intermediate_out, - const framework::Tensor *out, - const framework::Tensor *dout, + const phi::DenseTensor *x, + const phi::DenseTensor *y, + const phi::DenseTensor *intermediate_out, + const phi::DenseTensor *out, + const phi::DenseTensor *dout, int axis, - framework::Tensor *dx, - framework::Tensor *dy, - framework::Tensor *dintermediate, + phi::DenseTensor *dx, + phi::DenseTensor *dy, + phi::DenseTensor *dintermediate, DX_OP dx_op, DY_OP dy_op, DIntermediate_OP dintermediate_op) { @@ -1244,15 +1244,15 @@ void FusedElemwiseAndActGradComputeWithBroadcast( const framework::ExecutionContext &ctx, const framework::DDim &x_dim, const framework::DDim &y_dim_untrimed, - const framework::Tensor *x, - const framework::Tensor *y, - const framework::Tensor *intermediate_out, - const framework::Tensor *out, - const framework::Tensor *dout, + const phi::DenseTensor *x, + const phi::DenseTensor *y, + const phi::DenseTensor *intermediate_out, + const phi::DenseTensor *out, + const phi::DenseTensor *dout, int axis, - framework::Tensor *dx, - framework::Tensor *dy, - framework::Tensor *dintermediate, + phi::DenseTensor *dx, + phi::DenseTensor *dy, + phi::DenseTensor *dintermediate, DX_OP dx_op, DY_OP dy_op, DIntermediate_OP dintermediate_op) { @@ -1385,15 +1385,15 @@ template void FusedElemwiseAndActGradComputeEx(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *intermediate_out, - const framework::Tensor *dout, + const phi::DenseTensor *x, + const phi::DenseTensor *y, + const phi::DenseTensor *out, + const phi::DenseTensor *intermediate_out, + const phi::DenseTensor *dout, int axis, - framework::Tensor *dx, - framework::Tensor *dy, - framework::Tensor *dintermediate, + phi::DenseTensor *dx, + phi::DenseTensor *dy, + phi::DenseTensor *dintermediate, DX_OP dx_op, DY_OP dy_op, DIntermediate_OP dintermediate_op) { @@ -1497,12 +1497,12 @@ template void FusedElemwiseAndActComputeEx(const framework::ExecutionContext &ctx, - const framework::Tensor &x, - const framework::Tensor &y, + const phi::DenseTensor &x, + const phi::DenseTensor &y, int axis, CompoundFunctor compound_functor, - framework::Tensor *out, - framework::Tensor *intermediate_out) { + phi::DenseTensor *out, + phi::DenseTensor *intermediate_out) { if (KeepIntermediateOut) { PADDLE_ENFORCE_NOT_NULL( intermediate_out, @@ -1578,9 +1578,9 @@ void FusedElemwiseAndActComputeEx(const framework::ExecutionContext &ctx, template static inline void 
GetDoubleGradSafeTensor( const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *ddx, - framework::Tensor *ddx_safe) { + const phi::DenseTensor *x, + const phi::DenseTensor *ddx, + phi::DenseTensor *ddx_safe) { const auto &dev_ctx = ctx.template device_context(); phi::funcs::GetDoubleGradSafeTensor( dev_ctx, *x, ddx, ddx_safe); @@ -1599,10 +1599,10 @@ template void GetGradXAndYOut(const phi::GPUContext &dev_ctx, const platform::Place &place, int axis, - std::vector ins, - const framework::Tensor *dout, - framework::Tensor *dx, - framework::Tensor *dy, + std::vector ins, + const phi::DenseTensor *dout, + phi::DenseTensor *dx, + phi::DenseTensor *dy, Functor func) { phi::GetGradXAndYOut( dev_ctx, place, axis, ins, *dout, dx, dy, func); @@ -1612,9 +1612,9 @@ template void GetGradXOrYOut(const phi::GPUContext &dev_ctx, const platform::Place &place, int axis, - std::vector ins, - const framework::Tensor *dout, - framework::Tensor *dxy, + std::vector ins, + const phi::DenseTensor *dout, + phi::DenseTensor *dxy, Functor func) { phi::GetGradXOrYOut( dev_ctx, place, axis, ins, *dout, dxy, func); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index db1cc766a3e61..1e9b87c965656 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -29,8 +29,8 @@ using ElementwiseType = phi::ElementwiseType; template void LaunchSameDimsElementwiseCudaKernel( const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, + const std::vector &ins, + std::vector *outs, Functor func) { std::vector pt_inputs; std::vector pt_outputs; diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc index 431122641ec3d..6942377049b47 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ElementwisePowMLUKernel : public framework::OpKernel { @@ -32,11 +32,11 @@ template class ElementwisePowGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc index 9e935bb683232..18853222ba6b7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc @@ -21,7 +21,7 @@ limitations under the License. 
 */
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;
 
 template <typename DeviceContext, typename T>
 class ElementwisePowNPUKernel : public framework::OpKernel<T> {
@@ -30,9 +30,9 @@ class ElementwisePowNPUKernel : public framework::OpKernel<T> {
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::NPUDeviceContext>();
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Output<Tensor>("Out");
+    auto* x = ctx.Input<phi::DenseTensor>("X");
+    auto* y = ctx.Input<phi::DenseTensor>("Y");
+    auto* out = ctx.Output<phi::DenseTensor>("Out");
 
     auto place = ctx.GetPlace();
 
     int axis = ctx.Attr<int>("axis");
@@ -72,11 +72,11 @@ class ElementwisePowGradNPUKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::NPUDeviceContext>();
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    auto* x = ctx.Input<phi::DenseTensor>("X");
+    auto* y = ctx.Input<phi::DenseTensor>("Y");
+    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
     auto place = ctx.GetPlace();
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc
index 49fae0a9f5b61..0f56044d268e4 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;
 
 template <typename T>
 class ElementwiseSubMLUKernel : public framework::OpKernel<T> {
@@ -36,11 +36,11 @@ class ElementwiseSubGradMLUKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    auto* x = ctx.Input<phi::DenseTensor>("X");
+    auto* y = ctx.Input<phi::DenseTensor>("Y");
+    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
     axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
@@ -79,7 +79,7 @@ class ElementwiseSubGradMLUKernel : public framework::OpKernel<T> {
     }
     if (dy) {
       dy->mutable_data<T>(ctx.GetPlace());
-      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
       if (dy->dims() != dout->dims()) {
         std::vector<int> dst_dims_vec;
         std::vector<int> reduce_axes;
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
index ca4c469ce2c66..8df295a972559 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
@@ -21,15 +21,15 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ElementwiseSubNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -46,9 +46,9 @@ template class ElementwiseSubGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); auto stream = ctx.template device_context() @@ -75,7 +75,7 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { for (auto i = 0; i < reduce_ndim; ++i) { axes.push_back(i); } - Tensor* tmp_dout = const_cast(dout); + phi::DenseTensor* tmp_dout = const_cast(dout); Tensor reduced_dout(dx->type()); if (axes.size() != 0) { std::vector reduced_dout_dims; @@ -123,7 +123,7 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { for (auto i = 0; i < reduce_ndim; ++i) { axes.push_back(i); } - Tensor* tmp_dout = const_cast(dout); + phi::DenseTensor* tmp_dout = const_cast(dout); Tensor reduced_dy(dy->type()); Tensor reduced_dout(dy->type()); @@ -145,7 +145,7 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { // stage 2 axes.clear(); - Tensor* tmp_dy = tmp_dout; + phi::DenseTensor* tmp_dy = tmp_dout; for (auto i = 0; i < dy->dims().size(); ++i) { if (dy->dims()[i] == 1) { axes.push_back(i); diff --git a/paddle/fluid/operators/elementwise/elementwise_xpu.h b/paddle/fluid/operators/elementwise/elementwise_xpu.h index 403ba5a592fd0..10e4813008af4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_xpu.h +++ b/paddle/fluid/operators/elementwise/elementwise_xpu.h @@ -68,11 +68,11 @@ void XPUElementwiseGrad(const framework::ExecutionContext& ctx, const std::vector&, const std::vector&)> func, bool use_x_y_data) { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dz = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dz = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); auto& dev_ctx = diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 42d749b7b8e3e..c830d5a5bc5df 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -28,10 +28,9 @@ using dnnl::memory; using dnnl::primitive; using dnnl::stream; using framework::DataLayout; -using framework::Tensor; -inline std::vector CalculateBroadcastedDims(const Tensor* x, - const Tensor* y) { +inline std::vector CalculateBroadcastedDims( + const phi::DenseTensor* x, const phi::DenseTensor* y) { const auto src_tz = phi::vectorize(x->dims()); const auto dst_tz = 
phi::vectorize(y->dims()); @@ -60,9 +59,9 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); float scale_x = ctx.Attr("Scale_x"); float scale_y = ctx.Attr("Scale_y"); @@ -136,19 +135,19 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* dout = ctx.Input(framework::GradVarName("Out")); // oneDNN's binary is optimized for broadcasting y into x, so in other case // we have to swap tensors to achieve optimal performance diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index aed1ca284a1af..47dc2eb383249 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -55,7 +55,7 @@ class EmptyOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") { return expected_kernel_type; diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 3d32c9b8a148f..b793d835fca98 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -17,8 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class ExpandAsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h index 7a856a0153dd4..58b6b619c231a 100644 --- a/paddle/fluid/operators/expand_as_op.h +++ b/paddle/fluid/operators/expand_as_op.h @@ -23,7 +23,7 @@ limitations under the License. 
 */
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;
 
 template <typename T, int MajorType = Eigen::RowMajor,
@@ -38,7 +38,7 @@ template <typename DeviceContext, typename T>
 class ExpandAsKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto rank = context.Input<Tensor>("X")->dims().size();
+    auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
     switch (rank) {
       case 1:
         ExpandAs<1>(context);
@@ -69,10 +69,10 @@ class ExpandAsKernel : public framework::OpKernel<T> {
  protected:
   template <int Rank>
   void ExpandAs(const framework::ExecutionContext& context) const {
-    auto* in0 = context.Input<Tensor>("X");
+    auto* in0 = context.Input<phi::DenseTensor>("X");
     auto in_dims = in0->dims();
-    auto* target_tensor = context.Input<Tensor>("target_tensor");
-    auto* out0 = context.Output<Tensor>("Out");
+    auto* target_tensor = context.Input<phi::DenseTensor>("target_tensor");
+    auto* out0 = context.Output<phi::DenseTensor>("Out");
     Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
     int bcast_dims_remainder = 0;
     auto x_dims = in0->dims();
@@ -113,8 +113,8 @@ template <typename DeviceContext, typename T>
 class ExpandAsGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in0 = context.Input<Tensor>("X");
-    auto* target_tensor = context.Input<Tensor>("target_tensor");
+    auto* in0 = context.Input<phi::DenseTensor>("X");
+    auto* target_tensor = context.Input<phi::DenseTensor>("target_tensor");
     auto x_dims = in0->dims();
     auto y_dims = target_tensor->dims();
     std::vector<int> bcast_dims;
@@ -138,8 +138,10 @@ class ExpandAsGradKernel : public framework::OpKernel<T> {
     }
     // no need reduce, just copy
     if (just_copy) {
-      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
-      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      auto* in0 =
+          context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
+      auto* out0 =
+          context.Output<phi::DenseTensor>(framework::GradVarName("X"));
       out0->mutable_data<T>(context.GetPlace());
       framework::TensorCopy(
           *in0, context.GetPlace(), context.device_context(), out0);
@@ -194,8 +196,8 @@ class ExpandAsGradKernel : public framework::OpKernel<T> {
                       const std::vector<int>& reduce_dims_vec) const {
     size_t reshape_size = reshape_dims_vec.size();
     size_t reduce_size = reduce_dims_vec.size();
-    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* in0 = context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
     out0->mutable_data<T>(context.GetPlace());
     auto x_grad = EigenVector<T>::Flatten(*out0);
     Eigen::DSizes<Eigen::DenseIndex, Dims> reshape_dims;
diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc
index 6fcf301897f29..772ef09219817 100644
--- a/paddle/fluid/operators/expand_as_v2_op.cc
+++ b/paddle/fluid/operators/expand_as_v2_op.cc
@@ -21,8 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::Tensor;
-
 class ExpandAsV2Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h
index 5533e7bf91205..1205fc0447f1e 100644
--- a/paddle/fluid/operators/expand_as_v2_op.h
+++ b/paddle/fluid/operators/expand_as_v2_op.h
@@ -24,7 +24,7 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/expand_as_v2_op_mlu.cc b/paddle/fluid/operators/expand_as_v2_op_mlu.cc index 3a7ced3a0cef3..8184af44916bb 100644 --- a/paddle/fluid/operators/expand_as_v2_op_mlu.cc +++ b/paddle/fluid/operators/expand_as_v2_op_mlu.cc @@ -20,13 +20,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ExpandAsV2MLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); + auto rank = context.Input("X")->dims().size(); auto target_shape = context.Attr>("target_shape"); auto target_rank = target_shape.size(); PADDLE_ENFORCE_GE(target_rank, @@ -55,7 +55,7 @@ class ExpandAsV2MLUKernel : public framework::OpKernel { protected: void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); + auto* in0 = context.Input("X"); auto in_dims = in0->dims(); auto target_shape = context.Attr>("target_shape"); auto vec_in_dims = phi::vectorize(in_dims); @@ -79,7 +79,7 @@ class ExpandAsV2MLUKernel : public framework::OpKernel { target_shape[i])); } } - auto* out0 = context.Output("Out"); + auto* out0 = context.Output("Out"); framework::DDim out_dims = phi::make_ddim(target_shape); diff --git a/paddle/fluid/operators/expand_as_v2_op_npu.cc b/paddle/fluid/operators/expand_as_v2_op_npu.cc index 69513d26a6fff..0ac693ff600c5 100644 --- a/paddle/fluid/operators/expand_as_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_as_v2_op_npu.cc @@ -21,7 +21,7 @@ template class ExpandAsV2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); + auto rank = context.Input("X")->dims().size(); auto target_shape = context.Attr>("target_shape"); auto target_rank = target_shape.size(); PADDLE_ENFORCE_GE(target_rank, @@ -50,7 +50,7 @@ class ExpandAsV2NPUKernel : public framework::OpKernel { protected: void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); + auto* in0 = context.Input("X"); auto in_dims = in0->dims(); auto target_shape = context.Attr>("target_shape"); auto vec_in_dims = phi::vectorize(in_dims); @@ -74,7 +74,7 @@ class ExpandAsV2NPUKernel : public framework::OpKernel { target_shape[i])); } } - auto* out0 = context.Output("Out"); + auto* out0 = context.Output("Out"); framework::DDim out_dims = phi::make_ddim(target_shape); diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 1261b7777010e..67b8102181e1b 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -21,8 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; - class ExpandOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -88,7 +86,7 @@ class ExpandOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "expand_times_tensor" || var_name == "ExpandTimes") { return expected_kernel_type; @@ -217,7 +215,7 @@ class ExpandGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "expand_times_tensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 72eab31e157ad..dc7e42f48333e 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -30,7 +30,7 @@ inline std::vector get_expand_times( if (ctx.HasInput("ExpandTimes")) { auto* expand_tensor = ctx.Input("ExpandTimes"); auto* expand_data = expand_tensor->data(); - framework::Tensor cpu_expand_tensor; + phi::DenseTensor cpu_expand_tensor; if (platform::is_gpu_place(expand_tensor->place())) { paddle::framework::TensorCopySync( *expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); @@ -56,20 +56,20 @@ inline std::vector get_expand_times( } auto list_expand_times_tensor = - ctx.MultiInput("expand_times_tensor"); + ctx.MultiInput("expand_times_tensor"); if (list_expand_times_tensor.size() > 0) { // get tensor from std::vector vec_epxand_times; for (size_t i = 0; i < list_expand_times_tensor.size(); ++i) { auto tensor = list_expand_times_tensor[i]; if (platform::is_gpu_place(tensor->place())) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_times.push_back(*temp.data()); } #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(tensor->place())) { // NOLINT - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_times.push_back(*temp.data()); } @@ -85,7 +85,7 @@ inline std::vector get_expand_times( } } -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template @@ -101,7 +101,7 @@ template class ExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); + auto rank = context.Input("X")->dims().size(); PADDLE_ENFORCE_GE( rank, 1, @@ -142,7 +142,7 @@ class ExpandKernel : public framework::OpKernel { protected: template void Expand(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); + auto* in0 = context.Input("X"); auto in_dims = in0->dims(); auto expand_times = get_expand_times(context); @@ -154,7 +154,7 @@ class ExpandKernel : public framework::OpKernel { "of dimensions (%d) of the input.", expand_times.size(), static_cast(in_dims.size()))); - auto* out0 = context.Output("Out"); + auto* out0 = context.Output("Out"); Eigen::DSizes bcast_dims; for (size_t i = 0; i < expand_times.size(); ++i) { bcast_dims[i] = expand_times[i]; @@ -187,7 +187,7 @@ template class ExpandGradKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); + auto* in0 = context.Input("X"); // auto& expand_times = context.Attr>("expand_times"); auto expand_times = get_expand_times(context); auto x_dims = in0->dims(); @@ -214,8 +214,10 @@ class ExpandGradKernel : public framework::OpKernel { } // no need reduce, just copy if (just_copy) { - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); + auto* in0 = + context.Input(framework::GradVarName("Out")); + auto* out0 = + context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); framework::TensorCopy( *in0, context.GetPlace(), context.device_context(), out0); @@ -285,8 +287,8 @@ class ExpandGradKernel : public framework::OpKernel { "reduce dimensions (%d).", reduce_size, reduce_dims_vec.size())); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); + auto* in0 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); Eigen::DSizes reshape_dims; diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index 45870767699ea..f1c81cb1b9ca0 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -26,7 +26,7 @@ template class ExpandNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); + auto rank = context.Input("X")->dims().size(); PADDLE_ENFORCE_GE( rank, 1, diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index fd92a43318c58..fb82f0b6524ba 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -28,8 +28,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; - class ExpandV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -53,7 +51,7 @@ class ExpandV2Op : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "expand_shapes_tensor" || var_name == "Shape") { return expected_kernel_type; @@ -179,7 +177,7 @@ class ExpandV2GradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "expand_shapes_tensor" || var_name == "Shape") { return expected_kernel_type; diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h index 2bf31ff221c5f..3c6d017977951 100644 --- a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ -31,7 +31,7 @@ inline std::vector get_expand_shape( if (ctx.HasInput("Shape")) { auto* shape_tensor = ctx.Input("Shape"); auto* shape_data = shape_tensor->data(); - framework::Tensor cpu_shape_tensor; + phi::DenseTensor cpu_shape_tensor; if (platform::is_gpu_place(shape_tensor->place())) { paddle::framework::TensorCopySync( *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); @@ -64,34 +64,34 @@ inline std::vector get_expand_shape( } auto list_expand_shapes_tensor = - ctx.MultiInput("expand_shapes_tensor"); + ctx.MultiInput("expand_shapes_tensor"); if (list_expand_shapes_tensor.size() > 0) { // get tensor from std::vector vec_epxand_shape; for (size_t i = 0; i < list_expand_shapes_tensor.size(); ++i) { auto tensor = list_expand_shapes_tensor[i]; if (platform::is_gpu_place(tensor->place())) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); } #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(tensor->place())) { // NOLINT - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); } #endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(tensor->place())) { // NOLINT - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); } #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(tensor->place())) { // NOLINT - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); } diff --git a/paddle/fluid/operators/expand_v2_op_mlu.cc b/paddle/fluid/operators/expand_v2_op_mlu.cc index 9dbf3df06d51a..4ae0b4192ab53 100644 --- a/paddle/fluid/operators/expand_v2_op_mlu.cc +++ b/paddle/fluid/operators/expand_v2_op_mlu.cc @@ -24,8 +24,8 @@ template class ExpandV2MLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); + auto* X = ctx.Input("X"); + auto* Out = ctx.Output("Out"); auto in_dims = X->dims(); auto expand_shape = get_expand_shape(ctx); auto vec_in_dims = 
phi::vectorize(in_dims); diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc index feb45f7d2e48c..d5748328b1d4d 100644 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_v2_op_npu.cc @@ -19,13 +19,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ExpandV2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); + auto* X = ctx.Input("X"); + auto* Out = ctx.Output("Out"); auto in_dims = X->dims(); auto expand_shape = get_expand_shape(ctx); @@ -158,8 +158,8 @@ template class ExpandV2NPUGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); auto stream = diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc index 3cf0f3830be19..6a01992c83335 100644 --- a/paddle/fluid/operators/eye_op_npu.cc +++ b/paddle/fluid/operators/eye_op_npu.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class EyeNPUKernel : public framework::OpKernel { @@ -36,7 +36,7 @@ class EyeNPUKernel : public framework::OpKernel { framework::NPUAttributeMap attr_input = { {"num_rows", num_rows}, {"num_columns", num_columns}, {"dtype", dtype}}; - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); const auto& runner = NpuOpRunner("Eye", {}, {*out}, attr_input); diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 4e1df4f98ab57..4f140e0a00d18 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -25,10 +25,10 @@ namespace operators { template struct DequantizeFunctor { void operator()(const phi::CPUContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor* scale, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, T max_range, - framework::Tensor* out) { + phi::DenseTensor* out) { auto in_e = framework::EigenVector::Flatten(*in); const T* scale_factor = scale->data(); auto out_e = framework::EigenVector::Flatten(*out); @@ -41,13 +41,13 @@ struct DequantizeFunctor { template struct ChannelDequantizeFunctor { void operator()(const phi::CPUContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor** scales, + const phi::DenseTensor* in, + const phi::DenseTensor** scales, const int scale_num, T max_range, const int quant_axis, const int x_num_col_dims, - framework::Tensor* out) { + phi::DenseTensor* out) { if (scale_num == 1) { // Dequant op is before quantized op // Dequantize the weight of quantized op @@ -57,8 +57,8 @@ struct ChannelDequantizeFunctor { if (quant_axis == 0) { for (int64_t i = 0; i < channel; i++) { T s = scale_factor[i]; - framework::Tensor one_channel_in = in->Slice(i, i + 1); - framework::Tensor one_channel_out = out->Slice(i, i + 1); + phi::DenseTensor one_channel_in = in->Slice(i, i + 1); + phi::DenseTensor one_channel_out = 
out->Slice(i, i + 1); auto in_e = framework::EigenVector::Flatten(one_channel_in); auto out_e = framework::EigenVector::Flatten(one_channel_out); auto& dev = *dev_ctx.eigen_device(); @@ -120,14 +120,14 @@ struct ChannelDequantizeFunctor { const T* scale_one = scales[0]->data(); const T* scale_two = scales[1]->data(); for (int i = 0; i < batch_size; i++) { - framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize( + phi::DenseTensor one_batch_in = in->Slice(i, i + 1).Resize( phi::slice_ddim(in->dims(), 1, in->dims().size())); - framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize( + phi::DenseTensor one_batch_out = out->Slice(i, i + 1).Resize( phi::slice_ddim(out->dims(), 1, out->dims().size())); for (int j = 0; j < channel; j++) { T s = scale_one[j]; - framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1); - framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1); + phi::DenseTensor one_channel_in = one_batch_in.Slice(j, j + 1); + phi::DenseTensor one_channel_out = one_batch_out.Slice(j, j + 1); auto in_e = framework::EigenVector::Flatten(one_channel_in); auto out_e = framework::EigenVector::Flatten(one_channel_out); auto& dev = *dev_ctx.eigen_device(); diff --git a/paddle/fluid/operators/fake_dequantize_op.cu.h b/paddle/fluid/operators/fake_dequantize_op.cu.h index 17b0d9787169e..20088c11f2aa0 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu.h +++ b/paddle/fluid/operators/fake_dequantize_op.cu.h @@ -33,10 +33,10 @@ __global__ void KeDequantize( template struct DequantizeFunctor { void operator()(const phi::GPUContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor* scale, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, T max_range, - framework::Tensor* out) { + phi::DenseTensor* out) { const T* in_data = in->data(); const T* scale_factor = scale->data(); T* out_data = out->mutable_data(dev_ctx.GetPlace()); @@ -102,13 +102,13 @@ __global__ void DequantizeTwoScale(const T* in, template struct ChannelDequantizeFunctor { void operator()(const phi::GPUContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor** scales, + const phi::DenseTensor* in, + const phi::DenseTensor** scales, const int scale_num, T max_range, const int quant_axis, const int x_num_col_dims, - framework::Tensor* out) { + phi::DenseTensor* out) { auto in_dims = in->dims(); const T* in_data = in->data(); T* out_data = out->mutable_data(dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h index cf8a7e148e40c..fba98963031b7 100644 --- a/paddle/fluid/operators/fake_dequantize_op.h +++ b/paddle/fluid/operators/fake_dequantize_op.h @@ -26,31 +26,31 @@ namespace operators { template struct DequantizeFunctor { void operator()(const DeviceContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor* scale, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, T max_range, - framework::Tensor* out); + phi::DenseTensor* out); }; template struct ChannelDequantizeFunctor { void operator()(const DeviceContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor** scales, + const phi::DenseTensor* in, + const phi::DenseTensor** scales, const int scale_num, T max_range, const int quant_axis, const int x_num_col_dims, - framework::Tensor* out); + phi::DenseTensor* out); }; template class FakeDequantizeMaxAbsKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { - auto* in = 
ctx.Input("X"); - auto* scale = ctx.Input("Scale"); - auto* out = ctx.Output("Out"); + auto* in = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* out = ctx.Output("Out"); float max_range = ctx.Attr("max_range"); @@ -66,9 +66,9 @@ template class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("X"); - auto scales = ctx.MultiInput("Scales"); - auto* out = ctx.Output("Out"); + auto* in = ctx.Input("X"); + auto scales = ctx.MultiInput("Scales"); + auto* out = ctx.Output("Out"); auto quant_bits = ctx.Attr>("quant_bits"); auto quant_axis = ctx.Attr("quant_axis"); diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index cb8263714a5e4..a97a52145d127 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -46,7 +46,7 @@ template struct FindAbsMaxFunctor; template struct FindChannelAbsMaxFunctor { void operator()(const phi::CPUContext &ctx, - const framework::Tensor &in_tensor, + const phi::DenseTensor &in_tensor, const int quant_axis, T *out_abs_max) { // At present, channelwise quantization supports conv2d, depthwise_conv2d @@ -91,11 +91,11 @@ template struct FindChannelAbsMaxFunctor; template struct ClipAndFakeQuantFunctor { void operator()(const phi::CPUContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, const int round_type, - framework::Tensor *out) { + phi::DenseTensor *out) { T s = scale.data()[0]; T inv_s = inverse(s); platform::Transform trans; @@ -122,11 +122,11 @@ template struct ClipAndFakeQuantFunctor; template struct ClipAndFakeQuantDequantFunctor { void operator()(const phi::CPUContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, const int round_type, - framework::Tensor *out) { + phi::DenseTensor *out) { T s = scale.data()[0]; T inv_s = inverse(s); @@ -156,12 +156,12 @@ template struct ClipAndFakeQuantDequantFunctor; template struct ChannelClipAndFakeQuantFunctor { void operator()(const phi::CPUContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, const int round_type, const int quant_axis, - framework::Tensor *out) { + phi::DenseTensor *out) { // At present, channelwise quantization supports conv2d, depthwise_conv2d // conv2d_transpose and mul PADDLE_ENFORCE_EQ( @@ -201,7 +201,7 @@ struct ChannelClipAndFakeQuantFunctor { for (int64_t i = 0; i < channel; i++) { T s = scale_data[i]; T inv_s = inverse(s); - framework::Tensor one_channel_out = out->Slice(i, i + 1); + phi::DenseTensor one_channel_out = out->Slice(i, i + 1); auto out_e = framework::EigenVector::Flatten(one_channel_out); out_e.device(*ctx.eigen_device()) = (bin_cnt * inv_s * out_e).round(); } @@ -238,12 +238,12 @@ template struct ChannelClipAndFakeQuantFunctor; template struct ChannelClipFakeQuantDequantFunctor { void operator()(const phi::CPUContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, const int round_type, const int quant_axis, - framework::Tensor *out) { + phi::DenseTensor *out) { PADDLE_ENFORCE_EQ( quant_axis == 0 || quant_axis == 1, true, @@ -280,7 +280,7 
@@ struct ChannelClipFakeQuantDequantFunctor { } for (int i = 0; i < channel; i++) { T s = scale_data[i]; - framework::Tensor one_channel_out = out->Slice(i, i + 1); + phi::DenseTensor one_channel_out = out->Slice(i, i + 1); auto out_e = framework::EigenVector::Flatten(one_channel_out); if (round_type == 0) { out_e.device(*ctx.eigen_device()) = @@ -328,12 +328,12 @@ template struct ChannelClipFakeQuantDequantFunctor; template struct FindRangeAbsMaxFunctor { void operator()(const phi::CPUContext &ctx, - const framework::Tensor &cur_scale, - const framework::Tensor &last_scale, - const framework::Tensor &iter, + const phi::DenseTensor &cur_scale, + const phi::DenseTensor &last_scale, + const phi::DenseTensor &iter, const int window_size, - framework::Tensor *scales_arr, - framework::Tensor *out_scale) { + phi::DenseTensor *scales_arr, + phi::DenseTensor *out_scale) { T *scale_arr = scales_arr->mutable_data(ctx.GetPlace()); int64_t it = iter.data()[0]; int idx = it % window_size; @@ -357,13 +357,13 @@ template struct FindRangeAbsMaxFunctor; template struct FindMovingAverageAbsMaxFunctor { void operator()(const phi::CPUContext &ctx, - const framework::Tensor &in_accum, - const framework::Tensor &in_state, + const phi::DenseTensor &in_accum, + const phi::DenseTensor &in_state, const T *cur_scale, const float rate, - framework::Tensor *out_state, - framework::Tensor *out_accum, - framework::Tensor *out_scale) { + phi::DenseTensor *out_state, + phi::DenseTensor *out_accum, + phi::DenseTensor *out_scale) { T accum = in_accum.data()[0]; T state = in_state.data()[0]; T scale = cur_scale[0]; diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 22ba8254cdc2c..10988d8807c6e 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -81,7 +81,7 @@ struct FindAbsMaxFunctor { int grid = (block - 1 + num) / block; grid = (grid > block) ? 
block : grid; - framework::Tensor max; + phi::DenseTensor max; T *max_data = max.mutable_data(phi::make_ddim({grid}), ctx.GetPlace()); FindAbsMaxKernel <<>>(in, num, max_data); @@ -165,7 +165,7 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1( template struct FindChannelAbsMaxFunctor { void operator()(const phi::GPUContext &ctx, - const framework::Tensor &in_tensor, + const phi::DenseTensor &in_tensor, const int quant_axis, T *out_abs_max) { PADDLE_ENFORCE_EQ( @@ -290,11 +290,11 @@ __global__ void ClipAndQuantDequantKernel(const T *in, template struct ClipAndFakeQuantFunctor { void operator()(const phi::GPUContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, const int round_type, - framework::Tensor *out) { + phi::DenseTensor *out) { int num = in.numel(); int block = 1024; int grid = (block - 1 + num) / block; @@ -313,11 +313,11 @@ template struct ClipAndFakeQuantFunctor; template struct ClipAndFakeQuantDequantFunctor { void operator()(const phi::GPUContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, const int round_type, - framework::Tensor *out) { + phi::DenseTensor *out) { int num = in.numel(); int block = 1024; int grid = (block - 1 + num) / block; @@ -409,12 +409,12 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN(const T *in, template struct ChannelClipAndFakeQuantFunctor { void operator()(const phi::GPUContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, const int round_type, const int quant_axis, - framework::Tensor *out) { + phi::DenseTensor *out) { PADDLE_ENFORCE_EQ( quant_axis == 0 || quant_axis == 1, true, @@ -491,18 +491,18 @@ __global__ void FindRangeAbsMaxAndFillArray(const T *cur_scale, template struct FindRangeAbsMaxFunctor { void operator()(const phi::GPUContext &ctx, - const framework::Tensor &cur_scale, - const framework::Tensor &last_scale, - const framework::Tensor &iter, + const phi::DenseTensor &cur_scale, + const phi::DenseTensor &last_scale, + const phi::DenseTensor &iter, const int window_size, - framework::Tensor *scales_arr, - framework::Tensor *out_scale) { + phi::DenseTensor *scales_arr, + phi::DenseTensor *out_scale) { const auto gpu_place = ctx.GetPlace(); T *scale_arr = scales_arr->mutable_data(gpu_place); T *out_scale_data = out_scale->mutable_data(gpu_place); - framework::Tensor need_find_max, out_size; + phi::DenseTensor need_find_max, out_size; int *find_max = need_find_max.mutable_data({1}, gpu_place); int *out_size_data = out_size.mutable_data({1}, gpu_place); @@ -559,13 +559,13 @@ template struct FindRangeAbsMaxFunctor; template struct FindMovingAverageAbsMaxFunctor { void operator()(const phi::GPUContext &ctx, - const framework::Tensor &in_accum, - const framework::Tensor &in_state, + const phi::DenseTensor &in_accum, + const phi::DenseTensor &in_state, const T *cur_scale, const float rate, - framework::Tensor *out_state, - framework::Tensor *out_accum, - framework::Tensor *out_scale) { + phi::DenseTensor *out_state, + phi::DenseTensor *out_accum, + phi::DenseTensor *out_scale) { const auto gpu_place = ctx.GetPlace(); T rate_t = static_cast(rate); @@ -660,12 +660,12 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis1(const T *in, template struct ChannelClipFakeQuantDequantFunctor { void operator()(const 
phi::GPUContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, const int round_type, const int quant_axis, - framework::Tensor *out) { + phi::DenseTensor *out) { // At present, channelwise quantization supports conv2d, depthwise_conv2d // conv2d_transpose and mul PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 92aaa1fb248b9..bbe0c4d38eae0 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -76,38 +76,38 @@ struct FindAbsMaxFunctor { template struct ClipAndFakeQuantFunctor { void operator()(const DeviceContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, const int round_type, - framework::Tensor *out); + phi::DenseTensor *out); }; template struct ClipAndFakeQuantDequantFunctor { void operator()(const DeviceContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, int round_type, - framework::Tensor *out); + phi::DenseTensor *out); }; template struct FindRangeAbsMaxFunctor { void operator()(const DeviceContext &ctx, - const framework::Tensor &cur_scale, - const framework::Tensor &last_scale, - const framework::Tensor &iter, + const phi::DenseTensor &cur_scale, + const phi::DenseTensor &last_scale, + const phi::DenseTensor &iter, const int window_size, - framework::Tensor *scales_arr, - framework::Tensor *out_scale); + phi::DenseTensor *scales_arr, + phi::DenseTensor *out_scale); }; template struct FindChannelAbsMaxFunctor { void operator()(const DeviceContext &ctx, - const framework::Tensor &in_tensor, + const phi::DenseTensor &in_tensor, const int quant_axis, T *out_abs_max); }; @@ -115,44 +115,44 @@ struct FindChannelAbsMaxFunctor { template struct ChannelClipAndFakeQuantFunctor { void operator()(const DeviceContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, const int round_type, const int quant_axis, - framework::Tensor *out); + phi::DenseTensor *out); }; template struct ChannelClipFakeQuantDequantFunctor { void operator()(const DeviceContext &ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, const int bin_cnt, int round_type, const int quant_axis, - framework::Tensor *out); + phi::DenseTensor *out); }; template struct FindMovingAverageAbsMaxFunctor { void operator()(const DeviceContext &ctx, - const framework::Tensor &in_accum, - const framework::Tensor &in_state, + const phi::DenseTensor &in_accum, + const phi::DenseTensor &in_state, const T *cur_scale, const float rate, - framework::Tensor *out_state, - framework::Tensor *out_accum, - framework::Tensor *out_scale); + phi::DenseTensor *out_state, + phi::DenseTensor *out_accum, + phi::DenseTensor *out_scale); }; template class FakeAbsMaxKernelBase : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); - auto *out = context.Output("Out"); - auto *out_scale = context.Output("OutScale"); + auto *in = context.Input("X"); + auto *out = context.Output("Out"); + auto *out_scale = context.Output("OutScale"); T *out_s = 
out_scale->mutable_data(context.GetPlace()); int bit_length = context.Attr("bit_length"); @@ -169,22 +169,22 @@ class FakeAbsMaxKernelBase : public framework::OpKernel { protected: virtual void RunClipFunctor(const DeviceContext &dev_ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, int bin_cnt, int round_type, - framework::Tensor *out) const = 0; + phi::DenseTensor *out) const = 0; }; template class FakeQuantizeAbsMaxKernel : public FakeAbsMaxKernelBase { protected: void RunClipFunctor(const DeviceContext &dev_ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, int bin_cnt, int round_type, - framework::Tensor *out) const override { + phi::DenseTensor *out) const override { ClipAndFakeQuantFunctor()( dev_ctx, in, scale, bin_cnt, round_type, out); } @@ -195,11 +195,11 @@ class FakeQuantizeDequantizeAbsMaxKernel : public FakeAbsMaxKernelBase { protected: void RunClipFunctor(const DeviceContext &dev_ctx, - const framework::Tensor &in, - const framework::Tensor &scale, + const phi::DenseTensor &in, + const phi::DenseTensor &scale, int bin_cnt, int round_type, - framework::Tensor *out) const override { + phi::DenseTensor *out) const override { ClipAndFakeQuantDequantFunctor()( dev_ctx, in, scale, bin_cnt, round_type, out); } @@ -209,10 +209,10 @@ template class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); + auto *in = context.Input("X"); - auto *out = context.Output("Out"); - auto *out_scale = context.Output("OutScale"); + auto *out = context.Output("Out"); + auto *out_scale = context.Output("OutScale"); out->mutable_data(context.GetPlace()); int bit_length = context.Attr("bit_length"); @@ -237,9 +237,9 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); - auto *out = context.Output("Out"); - auto *out_scale = context.Output("OutScale"); + auto *in = context.Input("X"); + auto *out = context.Output("Out"); + auto *out_scale = context.Output("OutScale"); T *out_scale_data = out_scale->mutable_data(context.GetPlace()); auto &dev_ctx = context.template device_context(); out->mutable_data(dev_ctx.GetPlace()); @@ -261,10 +261,10 @@ template class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); - auto *in_scale = context.Input("InScale"); + auto *in = context.Input("X"); + auto *in_scale = context.Input("InScale"); - auto *out = context.Output("Out"); + auto *out = context.Output("Out"); out->mutable_data(context.GetPlace()); bool is_test = context.Attr("is_test"); @@ -281,14 +281,14 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { } // training - auto *out_scale = context.Output("OutScale"); - auto *out_scales = context.Output("OutScales"); - auto *iter = context.Input("Iter"); + auto *out_scale = context.Output("OutScale"); + auto *out_scales = context.Output("OutScales"); + auto *iter = context.Input("Iter"); int window_size = context.Attr("window_size"); out_scale->mutable_data(context.GetPlace()); - framework::Tensor cur_scale; + phi::DenseTensor cur_scale; T *cur_scale_data = cur_scale.mutable_data({1}, 
context.GetPlace()); FindAbsMaxFunctor()( dev_ctx, in->data(), in->numel(), cur_scale_data); @@ -308,9 +308,9 @@ template class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); - auto *in_scale = context.Input("InScale"); - auto *out = context.Output("Out"); + auto *in = context.Input("X"); + auto *in_scale = context.Input("InScale"); + auto *out = context.Output("Out"); out->mutable_data(context.GetPlace()); bool is_test = context.Attr("is_test"); @@ -326,8 +326,8 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { } // training - auto *in_accum = context.Input("InAccum"); - auto *in_state = context.Input("InState"); + auto *in_accum = context.Input("InAccum"); + auto *in_state = context.Input("InState"); phi::DenseTensor tmp_scale; tmp_scale.Resize(phi::make_dim(1)); @@ -336,9 +336,9 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { FindAbsMaxFunctor()( dev_ctx, in->data(), in->numel(), cur_scale_data); - auto *out_state = context.Output("OutState"); - auto *out_accum = context.Output("OutAccum"); - auto *out_scale = context.Output("OutScale"); + auto *out_state = context.Output("OutState"); + auto *out_accum = context.Output("OutAccum"); + auto *out_scale = context.Output("OutScale"); out_state->mutable_data(context.GetPlace()); out_accum->mutable_data(context.GetPlace()); out_scale->mutable_data(context.GetPlace()); @@ -360,11 +360,11 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { protected: virtual void RunClipFunctor(const DeviceContext &dev_ctx, - const framework::Tensor &in, - const framework::Tensor &in_scale, + const phi::DenseTensor &in, + const phi::DenseTensor &in_scale, int bin_cnt, int round_type, - framework::Tensor *out) const = 0; + phi::DenseTensor *out) const = 0; }; template @@ -372,11 +372,11 @@ class FakeQuantizeMovingAverageAbsMaxKernel : public FakeMovingAverageAbsMaxKernelBase { protected: void RunClipFunctor(const DeviceContext &dev_ctx, - const framework::Tensor &in, - const framework::Tensor &in_scale, + const phi::DenseTensor &in, + const phi::DenseTensor &in_scale, int bin_cnt, int round_type, - framework::Tensor *out) const override { + phi::DenseTensor *out) const override { ClipAndFakeQuantFunctor()( dev_ctx, in, in_scale, bin_cnt, round_type, out); } @@ -387,11 +387,11 @@ class FakeQuantizeDequantizeMovingAverageAbsMaxKernel : public FakeMovingAverageAbsMaxKernelBase { protected: void RunClipFunctor(const DeviceContext &dev_ctx, - const framework::Tensor &in, - const framework::Tensor &in_scale, + const phi::DenseTensor &in, + const phi::DenseTensor &in_scale, int bin_cnt, int round_type, - framework::Tensor *out) const override { + phi::DenseTensor *out) const override { ClipAndFakeQuantDequantFunctor()( dev_ctx, in, in_scale, bin_cnt, round_type, out); } @@ -401,11 +401,11 @@ template class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); + auto *in = context.Input("X"); auto &dev_ctx = context.template device_context(); if (context.HasOutput("Out")) { - auto *out = context.Output("Out"); + auto *out = context.Output("Out"); out->mutable_data(context.GetPlace()); framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out); } @@ -417,8 +417,8 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { } // training - 
auto *in_accum = context.Input("InAccum"); - auto *in_state = context.Input("InState"); + auto *in_accum = context.Input("InAccum"); + auto *in_state = context.Input("InState"); phi::DenseTensor tmp_scale; tmp_scale.Resize(phi::make_dim(1)); T *cur_scale_data = dev_ctx.template Alloc(&tmp_scale); @@ -426,9 +426,9 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { FindAbsMaxFunctor()( dev_ctx, in->data(), in->numel(), cur_scale_data); - auto *out_state = context.Output("OutState"); - auto *out_accum = context.Output("OutAccum"); - auto *out_scale = context.Output("OutScale"); + auto *out_state = context.Output("OutState"); + auto *out_accum = context.Output("OutAccum"); + auto *out_scale = context.Output("OutScale"); out_state->mutable_data(context.GetPlace()); out_accum->mutable_data(context.GetPlace()); out_scale->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 24380b29ee125..87c2d75328fc1 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -24,7 +24,7 @@ namespace paddle { namespace operators { enum { kFCMKLDNNFP32 = 1, kFCMKLDNNINT8 = 2 }; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; inline void FCOutputSize(const framework::DDim& in_dims, const framework::DDim& w_dims, @@ -59,8 +59,8 @@ class FCOpKernel : public framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("Input"); - auto* w = ctx.Input("W"); - auto* bias = ctx.Input("Bias"); + auto* w = ctx.Input("W"); + auto* bias = ctx.Input("Bias"); auto* output = ctx.Output("Out"); int in_num_col_dims = ctx.Attr("in_num_col_dims"); bool with_relu = diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 43776e98a0225..d337d975c9aa3 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -549,8 +549,8 @@ class TestFeedForward { bool has_bias_; int size_src_, size_weight_, size_bias_, size_output_; - framework::Tensor src_, weight_, bias_, out_, bias_out_; - framework::Tensor dinput_, dweight_, dbias_, doutput_; + phi::DenseTensor src_, weight_, bias_, out_, bias_out_; + phi::DenseTensor dinput_, dweight_, dbias_, doutput_; std::vector src_vec_, weight_vec_, bias_vec_, out_vec_, bias_out_vec_; std::vector dinput_vec_, dweight_vec_, dbias_vec_, doutput_vec_; diff --git a/paddle/fluid/operators/fill_any_like_op.cc b/paddle/fluid/operators/fill_any_like_op.cc index eb66cc88b3145..bf79a98d21df4 100644 --- a/paddle/fluid/operators/fill_any_like_op.cc +++ b/paddle/fluid/operators/fill_any_like_op.cc @@ -43,7 +43,7 @@ class FillAnyLikeOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, diff --git a/paddle/fluid/operators/fill_any_like_op_mlu.cc b/paddle/fluid/operators/fill_any_like_op_mlu.cc index af45f2feb4ee0..5ef52d7b07ec8 100644 --- a/paddle/fluid/operators/fill_any_like_op_mlu.cc +++ b/paddle/fluid/operators/fill_any_like_op_mlu.cc @@ -28,7 +28,7 @@ class FillAnyLikeMLUKernel : public framework::OpKernel { T>::type>::type; void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Output("Out"); 
+ auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); float value = ctx.Attr("value"); diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc index af483d56eeaad..22f2c29bfa8ab 100644 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -30,7 +30,7 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto data_type = static_cast( context.Attr("dtype")); - auto* out = context.Output("Out"); + auto* out = context.Output("Out"); out->mutable_data(context.GetPlace()); float value = context.Attr("value"); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc index ba426dfe62a35..6fda2f1283fb3 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc @@ -29,7 +29,7 @@ class FillConstantBatchSizeLikeOpMLUKernel : public framework::OpKernel { auto str_value = ctx.Attr("str_value"); auto force_cpu = ctx.Attr("force_cpu"); - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); auto *in = ctx.Input("Input"); if (in->lod().size() && ctx.Attr("input_dim_idx") == 0) { // set the correct batch size for the LoDTensor. diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc index 479b2e19096e5..1f27dbdd4d77e 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { @@ -32,7 +32,7 @@ class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { auto str_value = ctx.Attr("str_value"); auto force_cpu = ctx.Attr("force_cpu"); - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); auto *in = ctx.Input("Input"); if (in->lod().size() && ctx.Attr("input_dim_idx") == 0) { // set the correct batch size for the LoDTensor. 
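The hunks above all follow one substitution pattern: operator kernels that previously fetched variables as framework::Tensor (often through a local `using Tensor = framework::Tensor;` alias) now name phi::DenseTensor directly in the ExecutionContext accessor calls and in helper signatures. A minimal illustrative sketch of the post-migration convention follows; the kernel name, variable names "X"/"Out", and the exact template arguments are placeholders assumed for illustration and are not taken from any file in this patch.

// Illustrative sketch only: a generic copy kernel written against the
// post-migration convention, naming phi::DenseTensor explicitly.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ExampleCopyKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Before this patch such calls were typically written as
    // ctx.Input<framework::Tensor>("X") / ctx.Output<Tensor>("Out").
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
    // Copy input to output on the kernel's place.
    framework::TensorCopy(*x, ctx.GetPlace(), out);
  }
};

}  // namespace operators
}  // namespace paddle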
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 28167c4736fa3..4c63b9969fd10 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -58,7 +58,7 @@ class FillConstantOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") { return expected_kernel_type; diff --git a/paddle/fluid/operators/fill_constant_op_mlu.cc b/paddle/fluid/operators/fill_constant_op_mlu.cc index 487962a7de8ca..664d70609e939 100644 --- a/paddle/fluid/operators/fill_constant_op_mlu.cc +++ b/paddle/fluid/operators/fill_constant_op_mlu.cc @@ -26,7 +26,7 @@ class FillConstantMLUKernel : public framework::OpKernel { auto str_value = ctx.Attr("str_value"); auto float_value = ctx.Attr("value"); - auto *out_var = ctx.Output("Out"); + auto *out_var = ctx.Output("Out"); T value; if (str_value.empty()) { @@ -55,7 +55,7 @@ class FillConstantMLUKernel : public framework::OpKernel { const T *value_data = &value; cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST; if (ctx.HasInput("ValueTensor")) { - auto *value_tensor = ctx.Input("ValueTensor"); + auto *value_tensor = ctx.Input("ValueTensor"); PADDLE_ENFORCE_EQ( value_tensor->numel(), 1, diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 47e26b0d415fa..1947020be857d 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -28,7 +28,7 @@ class FillConstantNPUKernel : public framework::OpKernel { auto str_value = ctx.Attr("str_value"); auto float_value = ctx.Attr("value"); - auto *out_var = ctx.Output("Out"); + auto *out_var = ctx.Output("Out"); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/fill_diagonal_op.cc b/paddle/fluid/operators/fill_diagonal_op.cc index 4bf9635ae45dd..8a7f5daa9f857 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cc +++ b/paddle/fluid/operators/fill_diagonal_op.cc @@ -73,9 +73,9 @@ class FillIDiagonalGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - // Note: don't get data type from ctx.Input("Input"); + // Note: don't get data type from ctx.Input("Input"); auto dtype = framework::TransToProtoVarType( - ctx.Input(framework::GradVarName("Out"))->type()); + ctx.Input(framework::GradVarName("Out"))->type()); return framework::OpKernelType(dtype, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.cc b/paddle/fluid/operators/fill_diagonal_tensor_op.cc index ccf9b7aa35938..5a7f56cbfd04d 100644 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.cc +++ b/paddle/fluid/operators/fill_diagonal_tensor_op.cc @@ -72,9 +72,9 @@ class FillDiagonalTensorGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - // Note: don't get data type from ctx.Input("Input"); + // Note: don't get data type from ctx.Input("Input"); auto dtype = - ctx.Input(framework::GradVarName("Out"))->type(); + ctx.Input(framework::GradVarName("Out"))->type(); return 
framework::OpKernelType(framework::TransToProtoVarType(dtype), ctx.GetPlace()); } diff --git a/paddle/fluid/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h index 9c967cf70e2f2..331af861cdff4 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.h +++ b/paddle/fluid/operators/fill_zeros_like_op.h @@ -23,7 +23,7 @@ template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); + auto* out = context.Output("Out"); out->mutable_data(context.GetPlace()); phi::funcs::SetConstant setter; diff --git a/paddle/fluid/operators/fill_zeros_like_op_npu.cc b/paddle/fluid/operators/fill_zeros_like_op_npu.cc index 3963dc505ad0c..be5160eef4404 100644 --- a/paddle/fluid/operators/fill_zeros_like_op_npu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op_npu.cc @@ -22,8 +22,8 @@ template class FillZerosLikeNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); out->mutable_data(context.GetPlace()); auto stream = diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu index 5df1e4f651aa9..5777dcf714589 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -43,7 +43,7 @@ namespace cg = cooperative_groups; namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using LoDTensor = framework::LoDTensor; @@ -360,7 +360,7 @@ class FilterByInstagGPUKernel : public framework::OpKernel { // X3 is local fc tag list // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] - const Tensor* x3 = context.Input("Filter_tag"); + const phi::DenseTensor* x3 = context.Input("Filter_tag"); const int64_t* x3_data = x3->data(); Vector x2_lods; diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index 66178b180a9e3..869d44430812c 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -29,7 +29,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using LoDTensor = framework::LoDTensor; @@ -50,7 +50,7 @@ class FilterByInstagKernel : public framework::OpKernel { auto* x2 = context.Input("Ins_tag"); // X3 is local fc tag list // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] - auto* x3 = context.Input("Filter_tag"); + auto* x3 = context.Input("Filter_tag"); std::unordered_set filter_tag; auto* x3_data = x3->data(); diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index e160fc6f09ad0..036f3b8222422 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -28,7 +28,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FlattenOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/flatten_op_npu.cc b/paddle/fluid/operators/flatten_op_npu.cc index 7b7f0133d8a11..93e69d0de6159 100644 --- a/paddle/fluid/operators/flatten_op_npu.cc +++ b/paddle/fluid/operators/flatten_op_npu.cc @@ -56,14 +56,14 @@ class Flatten2GradNPUKernel : public framework::OpKernel { } }; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class FlattenContiguousRangeNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); - auto *Out = ctx.Output("Out"); + auto *X = ctx.Input("X"); + auto *Out = ctx.Output("Out"); int start_axis = ctx.Attr("start_axis"); int stop_axis = ctx.Attr("stop_axis"); diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc index 7f00fad6e3d12..4c14418690a85 100644 --- a/paddle/fluid/operators/flip_op.cc +++ b/paddle/fluid/operators/flip_op.cc @@ -26,7 +26,6 @@ namespace paddle { namespace operators { using framework::OpKernelType; -using framework::Tensor; class FlipOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fsp_op.h b/paddle/fluid/operators/fsp_op.h index 1faace15454aa..0f8072520be2f 100644 --- a/paddle/fluid/operators/fsp_op.h +++ b/paddle/fluid/operators/fsp_op.h @@ -20,15 +20,15 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class FSPOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* output = context.Output("Out"); + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* output = context.Output("Out"); output->mutable_data(context.GetPlace()); auto x_dims = x->dims(); auto y_dims = y->dims(); @@ -69,12 +69,13 @@ template class FSPGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* d_x = context.Output(framework::GradVarName("X")); - auto* d_y = context.Output(framework::GradVarName("Y")); + auto* d_x = context.Output(framework::GradVarName("X")); + auto* d_y = context.Output(framework::GradVarName("Y")); if (d_x == nullptr && d_y == nullptr) { return; } - auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_out = + context.Input(framework::GradVarName("Out")); auto d_out_dims = d_out->dims(); auto batch_size = d_out_dims[0]; auto x_channel = d_out_dims[1]; @@ -89,7 +90,7 @@ class FSPGradOpKernel : public framework::OpKernel { set_zero(context.template device_context(), d_x, static_cast(0)); - auto* y = context.Input("Y"); + auto* y = context.Input("Y"); auto y_dims = y->dims(); h = y_dims[2]; w = y_dims[3]; @@ -122,7 +123,7 @@ class FSPGradOpKernel : public framework::OpKernel { set_zero(context.template device_context(), d_y, static_cast(0)); - auto* x = context.Input("X"); + auto* x = context.Input("X"); auto x_dims = x->dims(); h = x_dims[2]; w = x_dims[3]; diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 2b8b857966de1..b44faf3150115 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h 
@@ -324,7 +324,7 @@ void Launch2DColumnReduce(const phi::GPUContext& dev_ctx, BiasAddBwSinglePassKernel <<>>(d_out, reduce_num, left_num, d_bias); } else { - framework::Tensor tmp_sum; + phi::DenseTensor tmp_sum; tmp_sum.Resize({grid.y, left_num}); dev_ctx.template Alloc>( &tmp_sum, tmp_sum.numel() * sizeof(ReduceParamType)); diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index 07947f522cdae..c8ea19d463a1b 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -24,7 +24,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; // support gemm-nt and gemm-nn, which is used in fused_attention_op. template class AttnMatMul { @@ -47,11 +47,11 @@ class AttnMatMul { ~AttnMatMul() {} - void ComputeForward(const framework::Tensor* weight, - const framework::Tensor* input, - const framework::Tensor* bias, - framework::Tensor* output, - framework::Tensor* bias_out) { + void ComputeForward(const phi::DenseTensor* weight, + const phi::DenseTensor* input, + const phi::DenseTensor* bias, + phi::DenseTensor* output, + phi::DenseTensor* bias_out) { // Note: for blas.GEMM API in Paddle, it treats all inputs as row-major. // here: (transa, transb): nt, input * weight. CBLAS_TRANSPOSE transA = transA_ ? CblasTrans : CblasNoTrans; @@ -73,19 +73,19 @@ class AttnMatMul { output->data()); if (compute_bias_) { // bias_out = output + bias - std::vector ins = {output, bias}; - std::vector outs = {bias_out}; + std::vector ins = {output, bias}; + std::vector outs = {bias_out}; phi::funcs::BroadcastKernel( dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); } } - void ComputeBackward(const framework::Tensor* input, - const framework::Tensor* weight, - const framework::Tensor* d_output, - framework::Tensor* d_input, - framework::Tensor* d_weight, - framework::Tensor* d_bias, + void ComputeBackward(const phi::DenseTensor* input, + const phi::DenseTensor* weight, + const phi::DenseTensor* d_output, + phi::DenseTensor* d_input, + phi::DenseTensor* d_weight, + phi::DenseTensor* d_bias, bool use_addto = false) { T alpha = static_cast(1.0); T beta_dA = use_addto ? static_cast(1.0) : static_cast(0.0); diff --git a/paddle/fluid/operators/fused/attn_gemm_int8.h b/paddle/fluid/operators/fused/attn_gemm_int8.h index ba114df9085fb..98a45deac3c8d 100644 --- a/paddle/fluid/operators/fused/attn_gemm_int8.h +++ b/paddle/fluid/operators/fused/attn_gemm_int8.h @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class AttnMatmulINT8 { @@ -41,15 +41,15 @@ class AttnMatmulINT8 { // This function is used to execute GEMM, with input and output's types are // both T. 
- void ComputeForward(const framework::Tensor* weight, - const framework::Tensor* input, - framework::Tensor* input_tmp, - const framework::Tensor* bias, - framework::Tensor* output, - framework::Tensor* output_tmp, - framework::Tensor* bias_out, + void ComputeForward(const phi::DenseTensor* weight, + const phi::DenseTensor* input, + phi::DenseTensor* input_tmp, + const phi::DenseTensor* bias, + phi::DenseTensor* output, + phi::DenseTensor* output_tmp, + phi::DenseTensor* bias_out, const float quant_in_scale, - const framework::Tensor* dequant_out_scale, + const phi::DenseTensor* dequant_out_scale, const int quant_out_scale_offset, const int quant_round_type = 1, const float quant_max_bound = 127.0, @@ -80,8 +80,8 @@ class AttnMatmulINT8 { if (compute_bias_) { // bias_out = output + bias - std::vector ins = {output, bias}; - std::vector outs = {bias_out}; + std::vector ins = {output, bias}; + std::vector outs = {bias_out}; phi::funcs::BroadcastKernel( dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); PADDLE_ENFORCE_EQ(cudaGetLastError(), @@ -95,11 +95,11 @@ class AttnMatmulINT8 { // This function is used to execute GEMM, with input and output's types are // both INT8. - void ComputeForwardINT8ToINT8(const framework::Tensor* weight, - framework::Tensor* input, - const framework::Tensor* bias, - framework::Tensor* output, - framework::Tensor* bias_out) { + void ComputeForwardINT8ToINT8(const phi::DenseTensor* weight, + phi::DenseTensor* input, + const phi::DenseTensor* bias, + phi::DenseTensor* output, + phi::DenseTensor* bias_out) { helpers_[0]->GEMM(input->data(), weight->data(), output->data(), @@ -108,14 +108,14 @@ class AttnMatmulINT8 { // This function is used to execute GEMM, with input and output's types are // INT8 and T. - void ComputeForwardINT8ToT(const framework::Tensor* weight, + void ComputeForwardINT8ToT(const phi::DenseTensor* weight, const float quant_in_scale, - framework::Tensor* input, - const framework::Tensor* bias, - framework::Tensor* output, - framework::Tensor* output_tmp, - framework::Tensor* bias_out, - const framework::Tensor* dequant_out_scale, + phi::DenseTensor* input, + const phi::DenseTensor* bias, + phi::DenseTensor* output, + phi::DenseTensor* output_tmp, + phi::DenseTensor* bias_out, + const phi::DenseTensor* dequant_out_scale, const int quant_out_scale_offset) { helpers_[0]->GEMM(input->data(), weight->data(), @@ -133,8 +133,8 @@ class AttnMatmulINT8 { if (compute_bias_) { // bias_out = output + bias - std::vector ins = {output, bias}; - std::vector outs = {bias_out}; + std::vector ins = {output, bias}; + std::vector outs = {bias_out}; phi::funcs::BroadcastKernel( dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); PADDLE_ENFORCE_EQ(cudaGetLastError(), @@ -148,13 +148,13 @@ class AttnMatmulINT8 { // This function is used to execute GEMM, with input and output's types are T // and INT8. 
- void ComputeForwardTToINT8(const framework::Tensor* weight, + void ComputeForwardTToINT8(const phi::DenseTensor* weight, const float quant_in_scale, - const framework::Tensor* input, - framework::Tensor* input_tmp, - const framework::Tensor* bias, - framework::Tensor* output, - framework::Tensor* bias_out, + const phi::DenseTensor* input, + phi::DenseTensor* input_tmp, + const phi::DenseTensor* bias, + phi::DenseTensor* output, + phi::DenseTensor* bias_out, const int quant_round_type = 1, const float quant_max_bound = 127.0, const float quant_min_bound = -127.0) { diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 6f0ebc2c7ebf6..5eee2c9332830 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -27,7 +27,7 @@ namespace paddle { namespace operators { #if PADDLE_WITH_HIP || CUDNN_VERSION >= 7100 -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; @@ -45,11 +45,11 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* bias = ctx.Input("Bias"); - auto* residual = ctx.Input("ResidualData"); - auto* output = ctx.Output("Output"); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.Input("Bias"); + auto* residual = ctx.Input("ResidualData"); + auto* output = ctx.Output("Output"); dev_ctx.template Alloc(output, output->numel() * sizeof(T)); std::vector strides = ctx.Attr>("strides"); @@ -523,10 +523,10 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { #endif std::vector channels = ctx.Attr>("split_channels"); if (channels.size()) { - auto outs = ctx.MultiOutput("Outputs"); + auto outs = ctx.MultiOutput("Outputs"); if (x_dims[0] == 1) { // share data with Output - framework::Tensor t; + phi::DenseTensor t; t.ShareDataWith(*output); auto y_dims = output->dims(); t.Resize({y_dims[1], y_dims[2], y_dims[3]}); diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index e11792a5dfb61..5f30ee4cc832c 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -31,7 +31,7 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace framework = paddle::framework; namespace platform = paddle::platform; namespace op = paddle::operators; -using Tensor = paddle::framework::Tensor; +using Tensor = phi::DenseTensor; USE_OP_ITSELF(batch_norm); PD_DECLARE_KERNEL(batch_norm, GPU, ALL_LAYOUT); @@ -40,7 +40,7 @@ USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); template void InitRandomTensor(const std::vector &dims, - framework::Tensor *cpu_out) { + phi::DenseTensor *cpu_out) { T *cpu_out_ptr = cpu_out->mutable_data(phi::make_ddim(dims), platform::CPUPlace()); std::default_random_engine random(0); @@ -53,7 +53,7 @@ void InitRandomTensor(const std::vector &dims, template void InitConstantTensor(const std::vector &dims, T value, - framework::Tensor *cpu_out) { + phi::DenseTensor *cpu_out) { T *cpu_out_ptr = cpu_out->mutable_data(phi::make_ddim(dims), 
platform::CPUPlace()); for (int i = 0; i < cpu_out->numel(); ++i) { @@ -63,8 +63,8 @@ void InitConstantTensor(const std::vector &dims, template void CheckOutput(std::string name, - const framework::Tensor &cpu_res, - const framework::Tensor &cpu_base, + const phi::DenseTensor &cpu_res, + const phi::DenseTensor &cpu_base, float diff, bool is_relative_atol = false) { if (cpu_res.dims().size() == cpu_base.dims().size()) { @@ -102,9 +102,9 @@ void CheckOutput(std::string name, } template -void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, - framework::Tensor *cpu_sum, - framework::Tensor *cpu_sum_of_square) { +void ComputeSumAndSquareSum(const phi::DenseTensor &cpu_x, + phi::DenseTensor *cpu_sum, + phi::DenseTensor *cpu_sum_of_square) { // x is in NHWC format. const auto &dims = cpu_x.dims(); int64_t c = dims[3]; @@ -129,8 +129,7 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, } template -void ComputeInplaceAdd(const framework::Tensor &cpu_x, - framework::Tensor *cpu_y) { +void ComputeInplaceAdd(const phi::DenseTensor &cpu_x, phi::DenseTensor *cpu_y) { EXPECT_EQ(cpu_x.dims(), cpu_y->dims()); const T *cpu_x_ptr = cpu_x.data(); @@ -141,7 +140,7 @@ void ComputeInplaceAdd(const framework::Tensor &cpu_x, } template -void ComputeInplaceRelu(framework::Tensor *cpu_x) { +void ComputeInplaceRelu(phi::DenseTensor *cpu_x) { T *cpu_x_ptr = cpu_x->data(); for (int64_t i = 0; i < cpu_x->numel(); ++i) { cpu_x_ptr[i] = @@ -389,10 +388,10 @@ class CudnnBNAddReluTester { auto select = [&](Tensor *in) { return has_shortcut_ ? in : nullptr; }; - framework::Tensor cpu_mean_base_x; - framework::Tensor cpu_var_base_x; - framework::Tensor cpu_mean_base_z; - framework::Tensor cpu_var_base_z; + phi::DenseTensor cpu_mean_base_x; + phi::DenseTensor cpu_var_base_x; + phi::DenseTensor cpu_mean_base_z; + phi::DenseTensor cpu_var_base_z; if (!has_shortcut_ && fuse_add_ && (act_type_ == "relu")) { BaselineForwardFusedBNAddRelu(*ctx, &cpu_mean_base_x, @@ -416,11 +415,11 @@ class CudnnBNAddReluTester { select(&saved_reserve_space_z_)); } - framework::Tensor cpu_mean_x; - framework::Tensor cpu_var_x; - framework::Tensor cpu_y; - framework::Tensor cpu_mean_z; - framework::Tensor cpu_var_z; + phi::DenseTensor cpu_mean_x; + phi::DenseTensor cpu_var_x; + phi::DenseTensor cpu_y; + phi::DenseTensor cpu_mean_z; + phi::DenseTensor cpu_var_z; FusedForward(*ctx, &cpu_mean_x, &cpu_var_x, @@ -470,17 +469,17 @@ class CudnnBNAddReluTester { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - framework::Tensor cpu_dx_base; - framework::Tensor cpu_dz_base; - framework::Tensor cpu_dscale_base; - framework::Tensor cpu_dbias_base; + phi::DenseTensor cpu_dx_base; + phi::DenseTensor cpu_dz_base; + phi::DenseTensor cpu_dscale_base; + phi::DenseTensor cpu_dbias_base; BaselineBackwardFusedBNAddRelu( *ctx, &cpu_dx_base, &cpu_dz_base, &cpu_dscale_base, &cpu_dbias_base); - framework::Tensor cpu_dx; - framework::Tensor cpu_dz; - framework::Tensor cpu_dscale; - framework::Tensor cpu_dbias; + phi::DenseTensor cpu_dx; + phi::DenseTensor cpu_dz; + phi::DenseTensor cpu_dscale; + phi::DenseTensor cpu_dbias; FusedBackward(*ctx, &cpu_dx, &cpu_dz, &cpu_dscale, &cpu_dbias); CheckOutput("DX", cpu_dx, cpu_dx_base, diff, is_relative_atol); @@ -546,7 +545,7 @@ class CudnnBNAddReluTester { cpu_y, saved_reserve_space_x); if (has_shortcut_) { - framework::Tensor cpu_z_out; + phi::DenseTensor cpu_z_out; InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); 
ComputeBatchNormForward(ctx, cpu_z_, @@ -624,8 +623,8 @@ class CudnnBNAddReluTester { Tensor *saved_var, Tensor *equiv_scale, Tensor *equiv_bias) { - framework::Tensor cpu_sum; - framework::Tensor cpu_sum_of_square; + phi::DenseTensor cpu_sum; + phi::DenseTensor cpu_sum_of_square; ComputeSumAndSquareSum(cpu_x, &cpu_sum, &cpu_sum_of_square); auto place = ctx.GetPlace(); @@ -678,17 +677,17 @@ class CudnnBNAddReluTester { Tensor *cpu_var_z = nullptr, Tensor *cpu_saved_mean_z = nullptr, Tensor *cpu_saved_var_z = nullptr) { - framework::Tensor x; - framework::Tensor sum_x; - framework::Tensor sum_of_square_x; - framework::Tensor bn_scale_x; - framework::Tensor bn_bias_x; - - framework::Tensor z; - framework::Tensor sum_z; - framework::Tensor sum_of_square_z; - framework::Tensor bn_scale_z; - framework::Tensor bn_bias_z; + phi::DenseTensor x; + phi::DenseTensor sum_x; + phi::DenseTensor sum_of_square_x; + phi::DenseTensor bn_scale_x; + phi::DenseTensor bn_bias_x; + + phi::DenseTensor z; + phi::DenseTensor sum_z; + phi::DenseTensor sum_of_square_z; + phi::DenseTensor bn_scale_z; + phi::DenseTensor bn_bias_z; auto place = ctx.GetPlace(); paddle::framework::TensorCopySync(cpu_x_, place, &x); @@ -696,22 +695,22 @@ class CudnnBNAddReluTester { paddle::framework::TensorCopySync(cpu_z_, place, &z); } - framework::Tensor mean_x; - framework::Tensor var_x; - framework::Tensor saved_mean_x; - framework::Tensor saved_var_x; - framework::Tensor equiv_scale_x; - framework::Tensor equiv_bias_x; + phi::DenseTensor mean_x; + phi::DenseTensor var_x; + phi::DenseTensor saved_mean_x; + phi::DenseTensor saved_var_x; + phi::DenseTensor equiv_scale_x; + phi::DenseTensor equiv_bias_x; - framework::Tensor mean_z; - framework::Tensor var_z; - framework::Tensor saved_mean_z; - framework::Tensor saved_var_z; - framework::Tensor equiv_scale_z; - framework::Tensor equiv_bias_z; + phi::DenseTensor mean_z; + phi::DenseTensor var_z; + phi::DenseTensor saved_mean_z; + phi::DenseTensor saved_var_z; + phi::DenseTensor equiv_scale_z; + phi::DenseTensor equiv_bias_z; - framework::Tensor y; - framework::Tensor bitmask; + phi::DenseTensor y; + phi::DenseTensor bitmask; InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); paddle::framework::TensorCopySync(*cpu_mean_x, place, &mean_x); @@ -810,17 +809,17 @@ class CudnnBNAddReluTester { Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { - framework::Tensor dy; - framework::Tensor x; - framework::Tensor bn_scale; - framework::Tensor bn_bias; - framework::Tensor saved_mean; - framework::Tensor saved_var; - framework::Tensor bitmask; - framework::Tensor dx; - framework::Tensor dz; - framework::Tensor dscale; - framework::Tensor dbias; + phi::DenseTensor dy; + phi::DenseTensor x; + phi::DenseTensor bn_scale; + phi::DenseTensor bn_bias; + phi::DenseTensor saved_mean; + phi::DenseTensor saved_var; + phi::DenseTensor bitmask; + phi::DenseTensor dx; + phi::DenseTensor dz; + phi::DenseTensor dscale; + phi::DenseTensor dbias; auto place = ctx.GetPlace(); paddle::framework::TensorCopySync(cpu_dy_, place, &dy); @@ -880,27 +879,27 @@ class CudnnBNAddReluTester { bool has_shortcut_; // Forward input - framework::Tensor cpu_x_; - framework::Tensor cpu_bn_scale_x_; - framework::Tensor cpu_bn_bias_x_; - framework::Tensor cpu_z_; - framework::Tensor cpu_bn_scale_z_; - framework::Tensor cpu_bn_bias_z_; + phi::DenseTensor cpu_x_; + phi::DenseTensor cpu_bn_scale_x_; + phi::DenseTensor cpu_bn_bias_x_; + phi::DenseTensor cpu_z_; + phi::DenseTensor cpu_bn_scale_z_; + phi::DenseTensor 
cpu_bn_bias_z_; // Backward input - framework::Tensor cpu_dy_; - framework::Tensor cpu_bitmask_; - framework::Tensor cpu_saved_mean_x_; - framework::Tensor cpu_saved_var_x_; - framework::Tensor cpu_saved_mean_z_; - framework::Tensor cpu_saved_var_z_; - framework::Tensor cpu_saved_mean_base_x_; - framework::Tensor cpu_saved_var_base_x_; - framework::Tensor saved_reserve_space_x_; - framework::Tensor cpu_saved_mean_base_z_; - framework::Tensor cpu_saved_var_base_z_; - framework::Tensor saved_reserve_space_z_; - framework::Tensor cpu_y_base_; + phi::DenseTensor cpu_dy_; + phi::DenseTensor cpu_bitmask_; + phi::DenseTensor cpu_saved_mean_x_; + phi::DenseTensor cpu_saved_var_x_; + phi::DenseTensor cpu_saved_mean_z_; + phi::DenseTensor cpu_saved_var_z_; + phi::DenseTensor cpu_saved_mean_base_x_; + phi::DenseTensor cpu_saved_var_base_x_; + phi::DenseTensor saved_reserve_space_x_; + phi::DenseTensor cpu_saved_mean_base_z_; + phi::DenseTensor cpu_saved_var_base_z_; + phi::DenseTensor saved_reserve_space_z_; + phi::DenseTensor cpu_y_base_; double eps_ = 1e-5; float momentum_ = 0.9; diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 86588331ec2b1..b2201c89295ca 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; namespace dynload = platform::dynload; template using BatchNormParamType = diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index cde4ed061423e..01e5e24e0a016 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; namespace dynload = platform::dynload; template diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index ef93612ffce39..be518866f5f00 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -28,7 +28,7 @@ limitations under the License. 
*/ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace op = paddle::operators; -using Tensor = paddle::framework::Tensor; +using Tensor = phi::DenseTensor; USE_OP_ITSELF(conv2d); USE_OP_ITSELF(conv2d_grad); @@ -37,7 +37,7 @@ PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT); template void InitRandomTensor(const std::vector &dims, - framework::Tensor *cpu_out) { + phi::DenseTensor *cpu_out) { T *cpu_out_ptr = cpu_out->mutable_data(phi::make_ddim(dims), platform::CPUPlace()); @@ -49,8 +49,8 @@ void InitRandomTensor(const std::vector &dims, } template -void TransposeNchwToNhwc(const framework::Tensor &cpu_in, - framework::Tensor *cpu_out) { +void TransposeNchwToNhwc(const phi::DenseTensor &cpu_in, + phi::DenseTensor *cpu_out) { const auto &in_dims = cpu_in.dims(); EXPECT_EQ(cpu_in.dims().size(), 4); @@ -73,8 +73,8 @@ void TransposeNchwToNhwc(const framework::Tensor &cpu_in, } template -void CheckOutput(const framework::Tensor &cpu_res, - const framework::Tensor &cpu_base, +void CheckOutput(const phi::DenseTensor &cpu_res, + const phi::DenseTensor &cpu_base, float diff, bool is_relative_atol = false) { EXPECT_EQ(cpu_res.dims(), cpu_base.dims()); @@ -134,8 +134,8 @@ void ComputeConv2DBackward(const phi::GPUContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, const Tensor &cpu_output_grad, - framework::Tensor *cpu_input_grad, - framework::Tensor *cpu_filter_grad, + phi::DenseTensor *cpu_input_grad, + phi::DenseTensor *cpu_filter_grad, int stride, int padding, int dilation) { @@ -191,9 +191,9 @@ void ComputeConv2DBackward(const phi::GPUContext &ctx, } template -void ComputeSumAndSquareSum(const framework::Tensor &cpu_out, - framework::Tensor *cpu_sum, - framework::Tensor *cpu_sum_of_square) { +void ComputeSumAndSquareSum(const phi::DenseTensor &cpu_out, + phi::DenseTensor *cpu_sum, + phi::DenseTensor *cpu_sum_of_square) { const auto &dims = cpu_out.dims(); int64_t c = dims[3]; @@ -245,15 +245,15 @@ class CudnnNormConvolutionTester { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - framework::Tensor cpu_output_base; - framework::Tensor cpu_sum_base; - framework::Tensor cpu_sum_of_square_base; + phi::DenseTensor cpu_output_base; + phi::DenseTensor cpu_sum_base; + phi::DenseTensor cpu_sum_of_square_base; BaselineForward( *ctx, &cpu_output_base, &cpu_sum_base, &cpu_sum_of_square_base); - framework::Tensor cpu_output; - framework::Tensor cpu_sum; - framework::Tensor cpu_sum_of_square; + phi::DenseTensor cpu_output; + phi::DenseTensor cpu_sum; + phi::DenseTensor cpu_sum_of_square; FusedForward(*ctx, &cpu_output, &cpu_sum, &cpu_sum_of_square); // Check forward correctness between baseline and results of normconv. 
@@ -267,15 +267,15 @@ class CudnnNormConvolutionTester { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - framework::Tensor cpu_input_grad_base; - framework::Tensor cpu_filter_nchw_grad_base; - framework::Tensor cpu_filter_nhwc_grad_base; + phi::DenseTensor cpu_input_grad_base; + phi::DenseTensor cpu_filter_nchw_grad_base; + phi::DenseTensor cpu_filter_nhwc_grad_base; BaselineBackward(*ctx, &cpu_input_grad_base, &cpu_filter_nchw_grad_base); TransposeNchwToNhwc(cpu_filter_nchw_grad_base, &cpu_filter_nhwc_grad_base); - framework::Tensor cpu_input_grad; - framework::Tensor cpu_filter_nhwc_grad; + phi::DenseTensor cpu_input_grad; + phi::DenseTensor cpu_filter_nhwc_grad; FusedBackward(*ctx, &cpu_input_grad, &cpu_filter_nhwc_grad); // Check backward correctness between baseline and results of normconv. @@ -301,9 +301,9 @@ class CudnnNormConvolutionTester { } void BaselineForward(const phi::GPUContext &ctx, - framework::Tensor *cpu_output_base, - framework::Tensor *cpu_sum_base, - framework::Tensor *cpu_sum_of_square_base) { + phi::DenseTensor *cpu_output_base, + phi::DenseTensor *cpu_sum_base, + phi::DenseTensor *cpu_sum_of_square_base) { ComputeConv2DForward( ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base, stride_, padding_); ComputeSumAndSquareSum( @@ -311,8 +311,8 @@ class CudnnNormConvolutionTester { } void BaselineBackward(const phi::GPUContext &ctx, - framework::Tensor *cpu_input_grad_base, - framework::Tensor *cpu_filter_grad_base) { + phi::DenseTensor *cpu_input_grad_base, + phi::DenseTensor *cpu_filter_grad_base) { ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, @@ -326,14 +326,14 @@ class CudnnNormConvolutionTester { // get forward results of cudnn_norm_conv void FusedForward(const phi::GPUContext &ctx, - framework::Tensor *cpu_output, - framework::Tensor *cpu_sum, - framework::Tensor *cpu_sum_of_square) { - framework::Tensor input; - framework::Tensor filter_nhwc; - framework::Tensor output; - framework::Tensor sum; - framework::Tensor sum_of_square; + phi::DenseTensor *cpu_output, + phi::DenseTensor *cpu_sum, + phi::DenseTensor *cpu_sum_of_square) { + phi::DenseTensor input; + phi::DenseTensor filter_nhwc; + phi::DenseTensor output; + phi::DenseTensor sum; + phi::DenseTensor sum_of_square; auto place = ctx.GetPlace(); paddle::framework::TensorCopySync(cpu_input_, place, &input); @@ -364,13 +364,13 @@ class CudnnNormConvolutionTester { } void FusedBackward(const phi::GPUContext &ctx, - framework::Tensor *cpu_input_grad, - framework::Tensor *cpu_filter_grad) { - framework::Tensor input; - framework::Tensor filter_nhwc; - framework::Tensor output_grad; - framework::Tensor input_grad; - framework::Tensor filter_grad; + phi::DenseTensor *cpu_input_grad, + phi::DenseTensor *cpu_filter_grad) { + phi::DenseTensor input; + phi::DenseTensor filter_nhwc; + phi::DenseTensor output_grad; + phi::DenseTensor input_grad; + phi::DenseTensor filter_grad; auto place = ctx.GetPlace(); paddle::framework::TensorCopySync(cpu_input_, place, &input); @@ -415,12 +415,12 @@ class CudnnNormConvolutionTester { const int group_ = 1; // Forward input - framework::Tensor cpu_input_; - framework::Tensor cpu_filter_nchw_; - framework::Tensor cpu_filter_nhwc_; + phi::DenseTensor cpu_input_; + phi::DenseTensor cpu_filter_nchw_; + phi::DenseTensor cpu_filter_nhwc_; // Backward input - framework::Tensor cpu_output_grad_; + phi::DenseTensor cpu_output_grad_; }; // test for fp16, kernel = 1, output_channels = input_channels diff --git 
a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 60cf314c5ea3c..188f767daf1c8 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; namespace dynload = platform::dynload; diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 7de59dd9ee2e3..4854f81eae469 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class AttnDropoutParam { public: @@ -46,7 +46,7 @@ class AttnDropoutParam { bool is_upscale_in_train, bool is_fix_seed, int seed_val, - const Tensor* seed) { + const phi::DenseTensor* seed) { is_test_ = is_test; dropout_implementation_ = dropout_implementation; dropout_prob_ = dropout_prob; @@ -61,7 +61,7 @@ class AttnDropoutParam { bool is_upscale_in_train_; bool is_fix_seed_; int seed_val_; - const Tensor* seed_; + const phi::DenseTensor* seed_; }; template @@ -82,18 +82,18 @@ class FMHARef { ~FMHARef() {} - void ComputeForward(const Tensor& qkv_input_tensor, - const Tensor* cache_kv_tensor, - const Tensor* src_mask_tensor, - Tensor* transpose_2_out_tensor, - Tensor* cache_kv_out_tensor, - Tensor* qk_out_tensor, - Tensor* src_mask_out_tensor, - Tensor* softmax_out_tensor, - Tensor* dropout_mask_out_tensor, - Tensor* dropout_out_tensor, - Tensor* qktv_out_tensor, - Tensor* fmha_out_tensor) { + void ComputeForward(const phi::DenseTensor& qkv_input_tensor, + const phi::DenseTensor* cache_kv_tensor, + const phi::DenseTensor* src_mask_tensor, + phi::DenseTensor* transpose_2_out_tensor, + phi::DenseTensor* cache_kv_out_tensor, + phi::DenseTensor* qk_out_tensor, + phi::DenseTensor* src_mask_out_tensor, + phi::DenseTensor* softmax_out_tensor, + phi::DenseTensor* dropout_mask_out_tensor, + phi::DenseTensor* dropout_out_tensor, + phi::DenseTensor* qktv_out_tensor, + phi::DenseTensor* fmha_out_tensor) { // input shape: [bs, seq_len, 3, num_head, head_dim] // transpose with perm [2, 0, 3, 1, 4], // output_shape: [3, bs, num_head, seq_len, head_dim] @@ -138,8 +138,8 @@ class FMHARef { float alpha = 1.0 / sqrt(head_dim_); auto q_tensor = transpose_2_out_tensor->Slice(0, 1); auto functor = phi::funcs::ScaleFunctor(alpha); - std::vector ins = {&q_tensor}; - std::vector outs = {&q_tensor}; + std::vector ins = {&q_tensor}; + std::vector outs = {&q_tensor}; phi::funcs::ElementwiseKernel(dev_ctx_, ins, &outs, functor); } @@ -179,8 +179,8 @@ class FMHARef { seq_len_, dev_ctx_.stream()); } else { - std::vector ins; - std::vector outs; + std::vector ins; + std::vector outs; ins.emplace_back(qk_out_tensor); ins.emplace_back(src_mask_tensor); outs.emplace_back(src_mask_out_tensor); @@ -216,7 +216,7 @@ class FMHARef { dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, dropout_param_.seed_val_, - static_cast(*softmax_out_tensor), + static_cast(*softmax_out_tensor), dropout_param_.seed_, dropout_mask_out_tensor, dropout_out_tensor, @@ -258,22 +258,22 @@ class FMHARef { dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor); } - void ComputeBackward(const Tensor& 
transpose_2_out_tensor, - const Tensor* src_mask_tensor, - const Tensor& softmax_out_tensor, - const Tensor& dropout_mask_out_tensor, - const Tensor& dropout_out_tensor, - const Tensor& qk_out_tensor, - const Tensor& src_mask_out_tensor, - const Tensor& fmha_out_grad_tensor, - Tensor* qktv_out_grad_tensor, - Tensor* dropout_out_grad_tensor, - Tensor* softmax_out_grad_tensor, - Tensor* src_mask_out_grad_tensor, - Tensor* qk_out_grad_tensor, - Tensor* transpose_2_out_grad_tensor, - Tensor* src_mask_grad_tensor, - Tensor* qkv_input_grad_tensor) { + void ComputeBackward(const phi::DenseTensor& transpose_2_out_tensor, + const phi::DenseTensor* src_mask_tensor, + const phi::DenseTensor& softmax_out_tensor, + const phi::DenseTensor& dropout_mask_out_tensor, + const phi::DenseTensor& dropout_out_tensor, + const phi::DenseTensor& qk_out_tensor, + const phi::DenseTensor& src_mask_out_tensor, + const phi::DenseTensor& fmha_out_grad_tensor, + phi::DenseTensor* qktv_out_grad_tensor, + phi::DenseTensor* dropout_out_grad_tensor, + phi::DenseTensor* softmax_out_grad_tensor, + phi::DenseTensor* src_mask_out_grad_tensor, + phi::DenseTensor* qk_out_grad_tensor, + phi::DenseTensor* transpose_2_out_grad_tensor, + phi::DenseTensor* src_mask_grad_tensor, + phi::DenseTensor* qkv_input_grad_tensor) { auto blas = phi::funcs::GetBlas(dev_ctx_); int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; int k_size = q_size; @@ -385,7 +385,7 @@ class FMHARef { false, dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, - static_cast(*dropout_out_grad_tensor), + static_cast(*dropout_out_grad_tensor), dropout_mask_out_tensor, softmax_out_grad_tensor, false); diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 90f6d34535196..e1c3bcdd83f46 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusedAttentionOp : public framework::OperatorWithKernel { public: @@ -257,7 +257,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input = ctx.Input("X"); + auto input = ctx.Input("X"); auto input_data_type = framework::TransToProtoVarType(input->dtype()); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -567,7 +567,7 @@ class FusedAttentionGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input = ctx.Input("X"); + auto input = ctx.Input("X"); auto input_data_type = framework::TransToProtoVarType(input->dtype()); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 059d94031ac8e..62ea3f723dc9e 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -38,10 +38,10 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template -static void AllReduce(framework::Tensor &tensor, // NOLINT +static void AllReduce(phi::DenseTensor &tensor, // NOLINT const int ring_id, const phi::GPUContext &ctx) { if (ring_id == -1) return; @@ -82,46 +82,47 @@ class FusedAttentionOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { using U = LayerNormParamType; - auto *input_x = ctx.Input("X"); + auto *input_x = ctx.Input("X"); auto &dev_ctx = ctx.template device_context(); const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); const float epsilon = ctx.Attr("epsilon"); - auto *ln_scale = ctx.Input("LnScale"); - auto *ln_bias = ctx.Input("LnBias"); - auto *ln_mean = ctx.Output("LnMean"); - auto *ln_var = ctx.Output("LnVariance"); - auto *ln_out = ctx.Output("LnOut"); + auto *ln_scale = ctx.Input("LnScale"); + auto *ln_bias = ctx.Input("LnBias"); + auto *ln_mean = ctx.Output("LnMean"); + auto *ln_var = ctx.Output("LnVariance"); + auto *ln_out = ctx.Output("LnOut"); // x: qkv's input [batch_size, seq_len, dim_embed] // y: qkv's weight: [3, num_head, dim_head, dim_embed] - auto *qkv_weight = ctx.Input("QKVW"); - auto *qkv_bias = ctx.Input("QKVBias"); - auto *qkv_out = ctx.Output("QKVOut"); - auto *qkv_bias_out = ctx.Output("QKVBiasOut"); - - auto *src_mask = ctx.Input("SrcMask"); - auto *transpose_out_2 = ctx.Output("TransposeOut2"); - auto *cache_kv = ctx.Input("CacheKV"); - auto *cache_kv_out = ctx.Output("CacheKVOut"); - auto *qk_out = ctx.Output("QKOut"); - auto *qktv_out = ctx.Output("QKTVOut"); - auto *softmax_out = ctx.Output("SoftmaxOut"); - auto *attn_dropout_mask_out = ctx.Output("AttnDropoutMaskOut"); - auto *attn_dropout_out = ctx.Output("AttnDropoutOut"); - auto *src_mask_out = ctx.Output("SrcMaskOut"); - auto *fmha_out = ctx.Output("FMHAOut"); - - auto *out_linear_weight = ctx.Input("OutLinearW"); - auto *out_linear_bias = ctx.Input("OutLinearBias"); - auto *out_linear_out = ctx.Output("OutLinearOut"); - - auto *ln_scale_2 = ctx.Input("Ln2Scale"); - auto *ln_bias_2 = ctx.Input("Ln2Bias"); - auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); + auto *qkv_weight = ctx.Input("QKVW"); + auto *qkv_bias = ctx.Input("QKVBias"); + auto *qkv_out = ctx.Output("QKVOut"); + auto *qkv_bias_out = ctx.Output("QKVBiasOut"); + + auto *src_mask = ctx.Input("SrcMask"); + auto *transpose_out_2 = ctx.Output("TransposeOut2"); + auto *cache_kv = ctx.Input("CacheKV"); + auto *cache_kv_out = ctx.Output("CacheKVOut"); + auto *qk_out = ctx.Output("QKOut"); + auto *qktv_out = ctx.Output("QKTVOut"); + auto *softmax_out = ctx.Output("SoftmaxOut"); + auto *attn_dropout_mask_out = + ctx.Output("AttnDropoutMaskOut"); + auto *attn_dropout_out = ctx.Output("AttnDropoutOut"); + auto *src_mask_out = ctx.Output("SrcMaskOut"); + auto *fmha_out = ctx.Output("FMHAOut"); + + auto *out_linear_weight = ctx.Input("OutLinearW"); + auto *out_linear_bias = ctx.Input("OutLinearBias"); + auto *out_linear_out = ctx.Output("OutLinearOut"); + + auto *ln_scale_2 = ctx.Input("Ln2Scale"); + auto *ln_bias_2 = ctx.Input("Ln2Bias"); + auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); auto *bias_dropout_residual_out = - ctx.Output("BiasDropoutResidualOut"); - auto *ln_mean_2 = ctx.Output("Ln2Mean"); - auto *ln_var_2 = ctx.Output("Ln2Variance"); + ctx.Output("BiasDropoutResidualOut"); + auto *ln_mean_2 = ctx.Output("Ln2Mean"); + auto *ln_var_2 = ctx.Output("Ln2Variance"); const float 
ln_epsilon = ctx.Attr("ln_epsilon"); float attn_dropout_rate = ctx.Attr("attn_dropout_rate"); @@ -130,13 +131,14 @@ class FusedAttentionOpKernel : public framework::OpKernel { ctx.Attr("attn_dropout_implementation"); bool is_upscale_in_train_1 = (dropout_implementation_1 == "upscale_in_train"); - auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; + auto *seed_1 = + ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); int ring_id = ctx.Attr("ring_id"); // final output. - auto *out = ctx.Output("Y"); + auto *out = ctx.Output("Y"); // get data ptr for qkv part. const auto input_x_dims = input_x->dims(); @@ -377,29 +379,30 @@ class FusedAttentionGradKernel : public framework::OpKernel { ctx.Attr("attn_dropout_implementation"); bool is_upscale_in_train_1 = (dropout_implementation_1 == "upscale_in_train"); - auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; + auto *seed_1 = + ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); int ring_id = ctx.Attr("ring_id"); // get inputs. - auto *d_y = ctx.Input(framework::GradVarName("Y")); + auto *d_y = ctx.Input(framework::GradVarName("Y")); auto *d_y_data = d_y->data(); // fw input - auto *input_x = ctx.Input("X"); - auto *ln_scale = ctx.Input("LnScale"); - auto *ln_2_scale = ctx.Input("Ln2Scale"); + auto *input_x = ctx.Input("X"); + auto *ln_scale = ctx.Input("LnScale"); + auto *ln_2_scale = ctx.Input("Ln2Scale"); auto *x_data = input_x->data(); auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data()); auto *ln_2_scale_data = (ln_2_scale == nullptr ? nullptr : ln_2_scale->data()); // fw parameters. - auto *src_mask = ctx.Input("SrcMask"); - auto *qkv_weight = ctx.Input("QKVW"); - auto *qkv_bias = ctx.Input("QKVBias"); - auto *out_linear_weight = ctx.Input("OutLinearW"); - auto *out_linear_bias = ctx.Input("OutLinearBias"); + auto *src_mask = ctx.Input("SrcMask"); + auto *qkv_weight = ctx.Input("QKVW"); + auto *qkv_bias = ctx.Input("QKVBias"); + auto *out_linear_weight = ctx.Input("OutLinearW"); + auto *out_linear_bias = ctx.Input("OutLinearBias"); auto *src_mask_data = (src_mask == nullptr ? nullptr : src_mask->data()); auto *qkv_weight_data = qkv_weight->data(); auto *qkv_bias_data = (qkv_bias == nullptr) ? nullptr : qkv_bias->data(); @@ -408,20 +411,21 @@ class FusedAttentionGradKernel : public framework::OpKernel { (out_linear_bias == nullptr) ? 
nullptr : out_linear_bias->data(); // fw output - auto *fmha_out = ctx.Input("FMHAOut"); - auto *transpose_out_2 = ctx.Input("TransposeOut2"); - auto *qk_out = ctx.Input("QKOut"); - auto *qktv_out = ctx.Input("QKTVOut"); - auto *softmax_out = ctx.Input("SoftmaxOut"); - auto *attn_dropout_mask_out = ctx.Input("AttnDropoutMaskOut"); - auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); - auto *src_mask_out = ctx.Input("SrcMaskOut"); - auto *out_linear_out = ctx.Input("OutLinearOut"); - auto *ln_2_mean = ctx.Input("Ln2Mean"); - auto *ln_2_var = ctx.Input("Ln2Variance"); - auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + auto *fmha_out = ctx.Input("FMHAOut"); + auto *transpose_out_2 = ctx.Input("TransposeOut2"); + auto *qk_out = ctx.Input("QKOut"); + auto *qktv_out = ctx.Input("QKTVOut"); + auto *softmax_out = ctx.Input("SoftmaxOut"); + auto *attn_dropout_mask_out = + ctx.Input("AttnDropoutMaskOut"); + auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); + auto *src_mask_out = ctx.Input("SrcMaskOut"); + auto *out_linear_out = ctx.Input("OutLinearOut"); + auto *ln_2_mean = ctx.Input("Ln2Mean"); + auto *ln_2_var = ctx.Input("Ln2Variance"); + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); auto *bias_dropout_residual_out = - ctx.Input("BiasDropoutResidualOut"); + ctx.Input("BiasDropoutResidualOut"); auto *fmha_out_data = fmha_out->data(); auto *transpose_out_2_data = transpose_out_2->data(); auto *qk_out_data = qk_out->data(); @@ -433,25 +437,29 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *dropout_mask_out_data = dropout_mask_out->data(); // output's grad - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_qkv_out = ctx.Output(framework::GradVarName("QKVOut")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_qkv_out = + ctx.Output(framework::GradVarName("QKVOut")); auto *d_qkv_bias_out = - ctx.Output(framework::GradVarName("QKVBiasOut")); - auto *d_qktv_out = ctx.Output(framework::GradVarName("QKTVOut")); + ctx.Output(framework::GradVarName("QKVBiasOut")); + auto *d_qktv_out = + ctx.Output(framework::GradVarName("QKTVOut")); auto *d_transpose_out_2 = - ctx.Output(framework::GradVarName("TransposeOut2")); - auto *d_qk_out = ctx.Output(framework::GradVarName("QKOut")); + ctx.Output(framework::GradVarName("TransposeOut2")); + auto *d_qk_out = + ctx.Output(framework::GradVarName("QKOut")); auto *d_softmax_out = - ctx.Output(framework::GradVarName("SoftmaxOut")); + ctx.Output(framework::GradVarName("SoftmaxOut")); auto *d_attn_dropout_out = - ctx.Output(framework::GradVarName("AttnDropoutOut")); + ctx.Output(framework::GradVarName("AttnDropoutOut")); auto *d_src_mask_out = - ctx.Output(framework::GradVarName("SrcMaskOut")); - auto *d_fmha_out = ctx.Output(framework::GradVarName("FMHAOut")); + ctx.Output(framework::GradVarName("SrcMaskOut")); + auto *d_fmha_out = + ctx.Output(framework::GradVarName("FMHAOut")); auto *d_out_linear_out = - ctx.Output(framework::GradVarName("OutLinearOut")); - auto *d_bias_dropout_residual_out = - ctx.Output(framework::GradVarName("BiasDropoutResidualOut")); + ctx.Output(framework::GradVarName("OutLinearOut")); + auto *d_bias_dropout_residual_out = ctx.Output( + framework::GradVarName("BiasDropoutResidualOut")); auto *d_x_data = dev_ctx.template Alloc(d_x, d_x->numel() * sizeof(T)); // when qkv_bias is not nullptr, d_qkv_out is equals to d_qkv_bias_out, the // space can be reused. 
@@ -485,14 +493,18 @@ class FusedAttentionGradKernel : public framework::OpKernel { d_out_linear_out, d_out_linear_out->numel() * sizeof(T)); // parameter grad - auto *d_qkv_weight = ctx.Output(framework::GradVarName("QKVW")); - auto *d_qkv_bias = ctx.Output(framework::GradVarName("QKVBias")); + auto *d_qkv_weight = + ctx.Output(framework::GradVarName("QKVW")); + auto *d_qkv_bias = + ctx.Output(framework::GradVarName("QKVBias")); auto *d_out_linear_weight = - ctx.Output(framework::GradVarName("OutLinearW")); + ctx.Output(framework::GradVarName("OutLinearW")); auto *d_out_linear_bias = - ctx.Output(framework::GradVarName("OutLinearBias")); - auto *d_ln_2_scale = ctx.Output(framework::GradVarName("Ln2Scale")); - auto *d_ln_2_bias = ctx.Output(framework::GradVarName("Ln2Bias")); + ctx.Output(framework::GradVarName("OutLinearBias")); + auto *d_ln_2_scale = + ctx.Output(framework::GradVarName("Ln2Scale")); + auto *d_ln_2_bias = + ctx.Output(framework::GradVarName("Ln2Bias")); auto *d_qkv_weight_data = dev_ctx.template Alloc( d_qkv_weight, d_qkv_weight->numel() * sizeof(T)); @@ -664,16 +676,19 @@ class FusedAttentionGradKernel : public framework::OpKernel { } if (pre_layer_norm) { - auto *ln_mean = ctx.Input("LnMean"); - auto *ln_var = ctx.Input("LnVariance"); - auto *ln_out = ctx.Input("LnOut"); + auto *ln_mean = ctx.Input("LnMean"); + auto *ln_var = ctx.Input("LnVariance"); + auto *ln_out = ctx.Input("LnOut"); auto *ln_mean_data = ln_mean->data(); auto *ln_var_data = ln_var->data(); auto *ln_out_data = ln_out->data(); - auto *d_ln_out = ctx.Output(framework::GradVarName("LnOut")); - auto *d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); - auto *d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); + auto *d_ln_out = + ctx.Output(framework::GradVarName("LnOut")); + auto *d_ln_scale = + ctx.Output(framework::GradVarName("LnScale")); + auto *d_ln_bias = + ctx.Output(framework::GradVarName("LnBias")); auto *d_ln_out_data = dev_ctx.template Alloc(d_ln_out, d_ln_out->numel() * sizeof(T)); auto *d_ln_scale_data = diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc index 3e888a2e67fc7..94131197060b5 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel { public: @@ -64,7 +64,7 @@ class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input = ctx.Input("X"); + auto input = ctx.Input("X"); auto input_data_type = framework::TransToProtoVarType(input->dtype()); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -194,7 +194,7 @@ class FusedBiasDropoutResidualLnGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input = ctx.Input("X"); + auto input = ctx.Input("X"); auto input_data_type = framework::TransToProtoVarType(input->dtype()); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu index b194f07c848da..6da533aa77f3c 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -25,7 +25,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { @@ -33,18 +33,18 @@ class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { auto &dev_ctx = ctx.template device_context(); using U = LayerNormParamType; - auto *input_x = ctx.Input("X"); - auto *bias = ctx.Input("Bias"); - auto *residual = ctx.Input("Residual"); + auto *input_x = ctx.Input("X"); + auto *bias = ctx.Input("Bias"); + auto *residual = ctx.Input("Residual"); const float ln_epsilon = ctx.Attr("ln_epsilon"); - auto *ln_scale = ctx.Input("LnScale"); - auto *ln_bias = ctx.Input("LnBias"); - auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); + auto *ln_scale = ctx.Input("LnScale"); + auto *ln_bias = ctx.Input("LnBias"); + auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); auto *bias_dropout_residual_out = - ctx.Output("BiasDropoutResidualOut"); - auto *ln_mean = ctx.Output("LnMean"); - auto *ln_var = ctx.Output("LnVariance"); - auto *y = ctx.Output("Y"); + ctx.Output("BiasDropoutResidualOut"); + auto *ln_mean = ctx.Output("LnMean"); + auto *ln_var = ctx.Output("LnVariance"); + auto *y = ctx.Output("Y"); auto *x_data = input_x->data(); auto *bias_data = (bias == nullptr) ? nullptr : bias->data(); auto *residual_data = (residual == nullptr) ? 
nullptr : residual->data(); @@ -96,13 +96,13 @@ class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel { using U = LayerNormParamType; const float ln_epsilon = ctx.Attr("ln_epsilon"); auto &dev_ctx = ctx.template device_context(); - auto *d_y = ctx.Input(framework::GradVarName("Y")); - auto *ln_scale = ctx.Input("LnScale"); - auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + auto *d_y = ctx.Input(framework::GradVarName("Y")); + auto *ln_scale = ctx.Input("LnScale"); + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); auto *bias_dropout_residual_out = - ctx.Input("BiasDropoutResidualOut"); - auto *ln_mean = ctx.Input("LnMean"); - auto *ln_var = ctx.Input("LnVariance"); + ctx.Input("BiasDropoutResidualOut"); + auto *ln_mean = ctx.Input("LnMean"); + auto *ln_var = ctx.Input("LnVariance"); auto *d_y_data = d_y->data(); auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data()); auto *dropout_mask_out_data = dropout_mask_out->data(); @@ -110,13 +110,16 @@ class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel { auto *ln_mean_data = ln_mean->data(); auto *ln_var_data = ln_var->data(); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_residual = ctx.Output(framework::GradVarName("Residual")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - auto *d_bias_dropout_residual_out = - ctx.Output(framework::GradVarName("BiasDropoutResidualOut")); - auto *d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); - auto *d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_residual = + ctx.Output(framework::GradVarName("Residual")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto *d_bias_dropout_residual_out = ctx.Output( + framework::GradVarName("BiasDropoutResidualOut")); + auto *d_ln_scale = + ctx.Output(framework::GradVarName("LnScale")); + auto *d_ln_bias = + ctx.Output(framework::GradVarName("LnBias")); auto *d_x_data = dev_ctx.Alloc(d_x, d_x->numel() * sizeof(T)); auto *d_residual_data = dev_ctx.Alloc(d_residual, d_residual->numel() * sizeof(T)); diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index 2fdd38bc266fc..9a773fa91dc9c 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -168,26 +168,26 @@ framework::OpKernelType FusedBatchNormActOp::GetExpectedKernelType( if (input_data_type == framework::proto::VarType::FP64) { bn_param_type = framework::proto::VarType::FP64; } - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), - platform::errors::PreconditionNotMet( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), - platform::errors::PreconditionNotMet( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Mean")->dtype()), - platform::errors::PreconditionNotMet( - "Mean input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Variance")->dtype()), - platform::errors::PreconditionNotMet( - "Variance input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Scale")->dtype()), + platform::errors::PreconditionNotMet( + "Scale input should be of float 
type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Bias")->dtype()), + platform::errors::PreconditionNotMet( + "Bias input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Mean")->dtype()), + platform::errors::PreconditionNotMet( + "Mean input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Variance")->dtype()), + platform::errors::PreconditionNotMet( + "Variance input should be of float type")); framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 1a22de67b53db..c7fbdc88abb33 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -30,7 +30,7 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; template @@ -59,35 +59,35 @@ class FusedBatchNormActKernel // Get the size for each dimension. // NHWC [batch_size, in_height, in_width, in_channels] - const auto *x = ctx.Input("X"); + const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); PADDLE_ENFORCE_EQ(x_dims.size() >= 2 && x_dims.size() <= 5, true, platform::errors::PreconditionNotMet( "The Input dim size should be between 2 and 5")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); // Run training mode. // obtain running mean and running inv var, and see if we need to // initialize them. - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); dev_ctx.Alloc>( mean_out, mean_out->numel() * sizeof(BatchNormParamType)); dev_ctx.Alloc>( variance_out, variance_out->numel() * sizeof(BatchNormParamType)); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); dev_ctx.Alloc>( saved_mean, saved_mean->numel() * sizeof(BatchNormParamType)); dev_ctx.Alloc>( saved_variance, saved_variance->numel() * sizeof(BatchNormParamType)); - auto *y = ctx.Output("Y"); + auto *y = ctx.Output("Y"); dev_ctx.Alloc(y, y->numel() * sizeof(T)); int N, C, H, W, D; @@ -147,7 +147,7 @@ class FusedBatchNormActKernel // Create reserve space and workspace for batch norm. // Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. 
- auto *reserve_space = ctx.Output("ReserveSpace"); + auto *reserve_space = ctx.Output("ReserveSpace"); PADDLE_ENFORCE_NOT_NULL( reserve_space, platform::errors::NotFound( @@ -243,12 +243,12 @@ class FusedBatchNormActGradKernel double epsilon = static_cast(ctx.Attr("epsilon")); std::string act_type = ctx.Attr("act_type"); auto &dev_ctx = ctx.template device_context(); - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Y"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *reserve_space = ctx.Input("ReserveSpace"); + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *reserve_space = ctx.Input("ReserveSpace"); const auto &x_dims = x->dims(); @@ -261,9 +261,10 @@ class FusedBatchNormActGradKernel ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = + ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); dev_ctx.Alloc(d_x, d_x->numel() * sizeof(T)); PADDLE_ENFORCE_EQ( @@ -330,8 +331,8 @@ class FusedBatchNormActGradKernel PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( bn_param_desc_, data_desc_, mode_)); - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); const auto *saved_mean_data = saved_mean->template data>(); const auto *saved_var_data = diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.h b/paddle/fluid/operators/fused/fused_bn_activation_op.h index c848a917c1dac..d7e5d236359bd 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusedBatchNormActOp : public framework::OperatorWithKernel { public: @@ -39,7 +39,7 @@ class FusedBatchNormActOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc index 84bcd9e591966..cf0f97cdc0037 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -145,11 +145,13 @@ framework::OpKernelType FusedBatchNormAddActOp::GetExpectedKernelType( PADDLE_ENFORCE_EQ( bn_param_type, - framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), + framework::TransToProtoVarType( + ctx.Input("Scale")->dtype()), platform::errors::InvalidArgument("Scale input should be of float type")); PADDLE_ENFORCE_EQ( bn_param_type, - framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), + framework::TransToProtoVarType( + ctx.Input("Bias")->dtype()), platform::errors::InvalidArgument("Bias input should be of float type")); framework::LibraryType library = framework::LibraryType::kPlain; diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 6d541f0784234..5a192b2df5c94 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -30,7 +30,7 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; template @@ -59,29 +59,29 @@ class FusedBatchNormAddActKernel // Get the size for each dimension. // NHWC [batch_size, in_height, in_width, in_channels] - const auto *x = ctx.Input("X"); - const auto *z = ctx.Input("Z"); + const auto *x = ctx.Input("X"); + const auto *z = ctx.Input("Z"); const auto &in_dims = x->dims(); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); dev_ctx.Alloc>( mean_out, mean_out->numel() * sizeof(BatchNormParamType)); dev_ctx.Alloc>( variance_out, variance_out->numel() * sizeof(BatchNormParamType)); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); dev_ctx.Alloc>( saved_mean, saved_mean->numel() * sizeof(BatchNormParamType)); dev_ctx.Alloc>( saved_variance, saved_variance->numel() * sizeof(BatchNormParamType)); - auto *y = ctx.Output("Y"); + auto *y = ctx.Output("Y"); dev_ctx.Alloc(y, y->numel() * sizeof(T)); int N, C, H, W, D; @@ -124,7 +124,7 @@ class FusedBatchNormAddActKernel // Create reserve space and workspace for batch norm. // Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. 
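The allocation calls in these batch-norm kernels follow one pattern for the statistics outputs. A sketch of two of the lines above with their template arguments written out, assuming BatchNormParamType<T> as the surrounding sizeof expressions indicate:

// Sketch: statistics outputs are allocated in the param dtype, not in T.
dev_ctx.Alloc<BatchNormParamType<T>>(
    mean_out, mean_out->numel() * sizeof(BatchNormParamType<T>));
dev_ctx.Alloc<BatchNormParamType<T>>(
    saved_mean, saved_mean->numel() * sizeof(BatchNormParamType<T>));
// The activation output itself stays in T.
dev_ctx.Alloc<T>(y, y->numel() * sizeof(T));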
- auto *reserve_space = ctx.Output("ReserveSpace"); + auto *reserve_space = ctx.Output("ReserveSpace"); PADDLE_ENFORCE_NOT_NULL( reserve_space, platform::errors::NotFound( @@ -220,12 +220,12 @@ class FusedBatchNormAddActGradKernel double epsilon = static_cast(ctx.Attr("epsilon")); std::string act_type = ctx.Attr("act_type"); - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Y"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *reserve_space = ctx.Input("ReserveSpace"); + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *reserve_space = ctx.Input("ReserveSpace"); auto &dev_ctx = ctx.template device_context(); const auto &in_dims = x->dims(); @@ -235,10 +235,11 @@ class FusedBatchNormAddActGradKernel ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_z = ctx.Output(framework::GradVarName("Z")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_z = ctx.Output(framework::GradVarName("Z")); + auto *d_scale = + ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); d_x->mutable_data(ctx.GetPlace()); d_z->mutable_data(ctx.GetPlace()); @@ -286,8 +287,8 @@ class FusedBatchNormAddActGradKernel PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( bn_param_desc_, data_desc_, mode_)); - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); const auto *saved_mean_data = saved_mean->template data>(); const auto *saved_var_data = diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h index 07d2e4564b692..f4913bca3df98 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusedBatchNormAddActOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu index 06810c18cc05a..56f150c2dce42 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -49,8 +49,8 @@ struct TestFusedDropoutActBias { bool is_upscale_in_train; bool is_test; // default false, Set to true for inference only bool has_bias = true; - framework::Tensor src, bias, out, mask; - framework::Tensor dsrc, dbias; + phi::DenseTensor src, bias, out, mask; + phi::DenseTensor dsrc, dbias; std::vector src_vec, bias_vec, out_vec, mask_vec; std::vector correct_out, correct_dsrc, correct_dbias; diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 2d1491fefb07e..5d6dd1a5bbf81 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -38,7 +38,7 @@ struct DropoutParam { bool is_test; bool fix_seed; int increment; - const framework::Tensor* tensor_seed; + const phi::DenseTensor* tensor_seed; int seed_val; DropoutParam() { @@ -56,7 +56,7 @@ struct DropoutParam { bool is_test_, bool is_upscale_in_train_, float dropout_prob_, - const framework::Tensor* tensor_seed_, + const phi::DenseTensor* tensor_seed_, int seed_val_) { fix_seed = fix_seed_; seed = seed_; @@ -95,8 +95,9 @@ struct DropoutParam { } else { str_seed = str_seed + "Seed"; } - tensor_seed = - context.HasInput(str_seed) ? context.Input(str_seed) : nullptr; + tensor_seed = context.HasInput(str_seed) + ? 
context.Input(str_seed) + : nullptr; seed_val = context.Attr(pre_fix + "seed"); } diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index 8560907680480..8c81a646fdebb 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -174,8 +174,8 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(ctx.Input("X")->dtype(), - ctx.Input("Y")->dtype(), + PADDLE_ENFORCE_EQ(ctx.Input("X")->dtype(), + ctx.Input("Y")->dtype(), platform::errors::InvalidArgument( "The element's type of input should be the same.")); return framework::OpKernelType( diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h index 5942404a6beb1..0d6a5e3b40da9 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h @@ -49,13 +49,12 @@ template -static void RunBinaryCompoundFunctor( - const framework::ExecutionContext &ctx, - const BinaryFunctor &binary_functor, - const UnaryFunctor &unary_functor, - const framework::Tensor &in_x, - const framework::Tensor &in_y, - std::vector *outputs) { +static void RunBinaryCompoundFunctor(const framework::ExecutionContext &ctx, + const BinaryFunctor &binary_functor, + const UnaryFunctor &unary_functor, + const phi::DenseTensor &in_x, + const phi::DenseTensor &in_y, + std::vector *outputs) { // Z = Binary(X, Unary(Y)) // intermediate_out = Unary(Y) // out = Binary(X, Unary(Y)) @@ -86,13 +85,12 @@ template -static void RunUnaryCompoundFunctors( - const framework::ExecutionContext &ctx, - const UnaryFunctor &unary_functor, - const BinaryFunctor &binary_functor, - const framework::Tensor &in_x, - const framework::Tensor &in_y, - std::vector *outputs) { +static void RunUnaryCompoundFunctors(const framework::ExecutionContext &ctx, + const UnaryFunctor &unary_functor, + const BinaryFunctor &binary_functor, + const phi::DenseTensor &in_x, + const phi::DenseTensor &in_y, + std::vector *outputs) { // Z = Unary(Binary(X, Y)) // intermediate_out = Binary(X, Y) // out = Unary(Binary(X, Y)) @@ -132,14 +130,14 @@ static void RunBinaryCompoundGradFunctors( const BinaryGradFunctor &binary_grad_functor, const UnaryFunctor &unary_functor, const UnaryGradFunctor &unary_grad_functor, - const framework::Tensor *in_x, - const framework::Tensor *in_y, - const framework::Tensor *in_out, - const framework::Tensor *in_intermediate_out, - const framework::Tensor *in_out_grad, - framework::Tensor *x_grad, - framework::Tensor *y_grad, - framework::Tensor *d_intermediate_out) { + const phi::DenseTensor *in_x, + const phi::DenseTensor *in_y, + const phi::DenseTensor *in_out, + const phi::DenseTensor *in_intermediate_out, + const phi::DenseTensor *in_out_grad, + phi::DenseTensor *x_grad, + phi::DenseTensor *y_grad, + phi::DenseTensor *d_intermediate_out) { // Z = Binary(X, Unary(Y)) int axis = ctx.Attr("axis"); @@ -218,14 +216,14 @@ static void RunUnaryCompoundGradFunctors( const UnaryGradFunctor &unary_grad_functor, const BinaryFunctor &binary_functor, const BinaryGradFunctor &binary_grad_functor, - const framework::Tensor *in_x, - const framework::Tensor *in_y, - const framework::Tensor *in_out, - const framework::Tensor 
*in_intermediate_out, - const framework::Tensor *in_out_grad, - framework::Tensor *x_grad, - framework::Tensor *y_grad, - framework::Tensor *d_intermediate_out) { + const phi::DenseTensor *in_x, + const phi::DenseTensor *in_y, + const phi::DenseTensor *in_out, + const phi::DenseTensor *in_intermediate_out, + const phi::DenseTensor *in_out_grad, + phi::DenseTensor *x_grad, + phi::DenseTensor *y_grad, + phi::DenseTensor *d_intermediate_out) { // Z = Unary(Binary(X, Y)) int axis = ctx.Attr("axis"); @@ -298,9 +296,9 @@ static void RunUnaryCompoundGradFunctors( template static void RunFunctors(const framework::ExecutionContext &ctx, - const framework::Tensor &in_x, - const framework::Tensor &in_y, - std::vector *outputs) { + const phi::DenseTensor &in_x, + const phi::DenseTensor &in_y, + std::vector *outputs) { auto &functors = ctx.Attr>("functor_list"); // TODO(zcd): The following code can be refined. @@ -424,14 +422,14 @@ static void RunFunctors(const framework::ExecutionContext &ctx, template static void RunGradFunctors(const framework::ExecutionContext &ctx, - const framework::Tensor *in_x, - const framework::Tensor *in_y, - const framework::Tensor *in_out, - const framework::Tensor *in_intermediate_out, - const framework::Tensor *in_out_grad, - framework::Tensor *x_grad, - framework::Tensor *y_grad, - framework::Tensor *d_intermediate_out) { + const phi::DenseTensor *in_x, + const phi::DenseTensor *in_y, + const phi::DenseTensor *in_out, + const phi::DenseTensor *in_intermediate_out, + const phi::DenseTensor *in_out_grad, + phi::DenseTensor *x_grad, + phi::DenseTensor *y_grad, + phi::DenseTensor *d_intermediate_out) { auto &functors = ctx.Attr>("functor_list"); auto funcs_str = functors[0] + "," + functors[1]; @@ -622,11 +620,11 @@ template class FusedElemwiseActivationKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto &in_x = GET_DATA_SAFELY(ctx.Input("X"), + auto &in_x = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", "FusedElemwiseActivation"); - auto &in_y = GET_DATA_SAFELY(ctx.Input("Y"), + auto &in_y = GET_DATA_SAFELY(ctx.Input("Y"), "Input", "Y", "FusedElemwiseActivation"); @@ -635,9 +633,9 @@ class FusedElemwiseActivationKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "The output(Out) should not be empty")); - auto output = ctx.Output("Out"); + auto output = ctx.Output("Out"); - std::vector outputs; + std::vector outputs; outputs.emplace_back(output); if (ctx.Attr("save_intermediate_out")) { @@ -647,7 +645,7 @@ class FusedElemwiseActivationKernel : public framework::OpKernel { "The save_intermediate_out is enable, so the " "IntermediateOut should not be empty.")); - auto intermediate_out = ctx.Output("IntermediateOut"); + auto intermediate_out = ctx.Output("IntermediateOut"); outputs.emplace_back(intermediate_out); } else { outputs.emplace_back(nullptr); @@ -661,42 +659,42 @@ template class FusedElemwiseActivationGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto in_y = ctx.Input("Y"); + auto in_y = ctx.Input("Y"); PADDLE_ENFORCE_NE( in_y, nullptr, platform::errors::InvalidArgument("Input(Y) should not be nullptr.")); - auto in_out = ctx.Input("Out"); + auto in_out = ctx.Input("Out"); PADDLE_ENFORCE_NE( in_out, nullptr, platform::errors::InvalidArgument("Input(Out) should not be nullptr.")); auto in_out_grad = - ctx.Input(framework::GradVarName("Out")); + ctx.Input(framework::GradVarName("Out")); 
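The Z = Binary(X, Unary(Y)) and Z = Unary(Binary(X, Y)) comments above are the whole contract of these helpers. A scalar-level illustration, purely explanatory and not code from the patch, using relu and add as stand-in functors:

#include <algorithm>

// Explanatory sketch: the two compound shapes the Run*CompoundFunctor(s)
// helpers above evaluate element-wise over the input tensors.
inline float BinaryCompound(float x, float y) {
  auto unary = [](float v) { return std::max(v, 0.0f); };  // stand-in: relu
  auto binary = [](float a, float b) { return a + b; };    // stand-in: add
  // Z = Binary(X, Unary(Y)); intermediate_out = Unary(Y)
  return binary(x, unary(y));
}

inline float UnaryCompound(float x, float y) {
  auto unary = [](float v) { return std::max(v, 0.0f); };
  auto binary = [](float a, float b) { return a + b; };
  // Z = Unary(Binary(X, Y)); intermediate_out = Binary(X, Y)
  return unary(binary(x, y));
}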
PADDLE_ENFORCE_NE(in_out_grad, nullptr, platform::errors::InvalidArgument( "Input(Out@Grad) should not be nullptr.")); - framework::Tensor *in_x = - const_cast(ctx.Input("X")); - framework::Tensor *x_grad = - ctx.Output(framework::GradVarName("X")); - framework::Tensor *y_grad = - ctx.Output(framework::GradVarName("Y")); - framework::Tensor *d_intermediate_out = ctx.Output( - framework::GradVarName("IntermediateOut")); + phi::DenseTensor *in_x = + const_cast(ctx.Input("X")); + phi::DenseTensor *x_grad = + ctx.Output(framework::GradVarName("X")); + phi::DenseTensor *y_grad = + ctx.Output(framework::GradVarName("Y")); + phi::DenseTensor *d_intermediate_out = + ctx.Output(framework::GradVarName("IntermediateOut")); auto functor_list = ctx.Attr>("functor_list"); // Get intermediate_out - framework::Tensor *in_intermediate_out = nullptr; + phi::DenseTensor *in_intermediate_out = nullptr; if (ctx.Attr("save_intermediate_out")) { // if save_intermediate_out is true, for Unary(Binary(x, y)) and // Binary(x, Unary(y)), the Binary(x, y) and Unary(y) not need to // recompute. - in_intermediate_out = const_cast( - ctx.Input("IntermediateOut")); + in_intermediate_out = const_cast( + ctx.Input("IntermediateOut")); PADDLE_ENFORCE_NE(in_intermediate_out, nullptr, platform::errors::InvalidArgument( @@ -725,7 +723,7 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { platform::errors::InvalidArgument( "Only when the compoundfunctor contains " "elementwise_add_grad, the 'X' could be absent.")); - in_x = const_cast(in_out_grad); + in_x = const_cast(in_out_grad); } bool has_in_place = HasInPlaceUnary(functor_list); diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc index 1b291cfa018ad..4f8c4d12d6b58 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc @@ -105,7 +105,7 @@ class EmbeddingEltWiseLayerNormOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto inputs = ctx.MultiInput("Embs"); + auto inputs = ctx.MultiInput("Embs"); auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto* input : inputs) { diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 75e131b2deb34..8360f07a5f3e7 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -33,13 +33,13 @@ template class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; auto &device_ctx = context.template device_context(); - auto ids = context.MultiInput("Ids"); - auto embs = context.MultiInput("Embs"); + auto ids = context.MultiInput("Ids"); + auto embs = context.MultiInput("Embs"); int input_num = static_cast(ids.size()); - framework::Tensor in_ids_( + phi::DenseTensor in_ids_( framework::TransToPhiDataType(framework::proto::VarType::INT64)), in_embs_( framework::TransToPhiDataType(framework::proto::VarType::INT64)); @@ -90,9 +90,9 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { device_ctx.stream()); #endif - 
auto *bias = context.Input("Bias"); - auto *scale = context.Input("Scale"); - auto *out = context.Output("Out"); + auto *bias = context.Input("Bias"); + auto *scale = context.Input("Scale"); + auto *out = context.Output("Out"); // should be (B * S * hidden) auto id0_dims = ids[0]->dims(); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 8f413f34242a8..af75fa6112e3a 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -285,17 +285,17 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { act_cand = act_functor(act_cand_str); \ } -#define INIT_BASE_INPUT_OUTPUT \ - auto* ids = ctx.Input("Ids"); \ - auto* h0 = ctx.Input("H0"); \ - auto* c0 = ctx.Input("C0"); \ - auto* embeddings = ctx.Input("Embeddings"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* bias = ctx.Input("Bias"); \ - auto* xx = ctx.Output("XX"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - auto* cell_out = ctx.Output("Cell"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ +#define INIT_BASE_INPUT_OUTPUT \ + auto* ids = ctx.Input("Ids"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* embeddings = ctx.Input("Embeddings"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ bool use_peepholes = ctx.Attr("use_peepholes"); #define INIT_BASE_SIZES \ @@ -506,8 +506,8 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { INIT_VEC_FUNC INIT_BASE_INPUT_DATAS - auto* reordered_h0 = ctx.Output("ReorderedH0"); - auto* reordered_c0 = ctx.Output("ReorderedC0"); + auto* reordered_h0 = ctx.Output("ReorderedH0"); + auto* reordered_c0 = ctx.Output("ReorderedC0"); auto* batched_input = ctx.Output("BatchedInput"); auto* batched_c_out = ctx.Output("BatchedCell"); auto* batched_h_out = ctx.Output("BatchedHidden"); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h index 2775b2ac04d28..129123fc52cd7 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusedEmbeddingFCLSTMOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index c593c65618d78..ced30ccc50ec5 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index 758fb8a23f8f9..74ba0b54afd45 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -384,9 +384,9 @@ template class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* w = ctx.Input("W"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* out = ctx.Output("Out"); auto w_dims = w->dims(); int N = w_dims[1]; @@ -413,18 +413,18 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel { static_cast(0.0), out_data, N); - auto* y = ctx.Input("Y"); - auto* bias_0 = ctx.Input("Bias0"); - auto* bias_1 = ctx.Input("Bias1"); - auto* scale = ctx.Input("Scale"); + auto* y = ctx.Input("Y"); + auto* bias_0 = ctx.Input("Bias0"); + auto* bias_1 = ctx.Input("Bias1"); + auto* scale = ctx.Input("Scale"); const T* y_data = y->data(); const T* bias_0_data = bias_0 ? bias_0->data() : nullptr; const T* bias_1_data = bias_1 ? bias_1->data() : nullptr; const T* scale_data = scale ? scale->data() : nullptr; - auto* mean = ctx.Output("Mean"); - auto* variance = ctx.Output("Variance"); + auto* mean = ctx.Output("Mean"); + auto* variance = ctx.Output("Variance"); T* mean_data = mean ? dev_ctx.template Alloc(mean, mean->numel() * sizeof(T)) diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 9b8b256a9ee54..71fe468f780b2 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusedFeedForwardOp : public framework::OperatorWithKernel { public: @@ -345,7 +345,7 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input = ctx.Input("X"); + auto input = ctx.Input("X"); auto input_data_type = framework::TransToProtoVarType(input->dtype()); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 33d1e89bf28fe..6084b1f61f80c 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -31,10 +31,10 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template -static void AllReduce(framework::Tensor& tensor, // NOLINT +static void AllReduce(phi::DenseTensor& tensor, // NOLINT const int ring_id, const phi::GPUContext& ctx) { if (ring_id == -1) return; @@ -74,9 +74,9 @@ template class FusedFeedForwardKernel : public framework::OpKernel { public: void MatMul(const phi::GPUContext& ctx, - const framework::Tensor& a, - const framework::Tensor& b, - framework::Tensor* c) const { + const phi::DenseTensor& a, + const phi::DenseTensor& b, + phi::DenseTensor* c) const { auto blas = phi::funcs::GetBlas(ctx); auto a_2d = FoldInitDims(a); auto b_2d = FoldInitDims(b); @@ -87,26 +87,26 @@ class FusedFeedForwardKernel : public framework::OpKernel { } void FFN(const phi::GPUContext& ctx, - const framework::Tensor& x, - const framework::Tensor& linear1_weight, - const framework::Tensor* linear1_bias, - const framework::Tensor& linear2_weight, - const framework::Tensor* linear2_bias, - const framework::Tensor* ln1_scale, - const framework::Tensor* ln1_bias, - const framework::Tensor* ln2_scale, - const framework::Tensor* ln2_bias, - framework::Tensor* out, - framework::Tensor* dropout1_mask, - framework::Tensor* dropout2_mask, - framework::Tensor* ln1_mean, - framework::Tensor* ln1_variance, - framework::Tensor* ln2_mean, - framework::Tensor* ln2_variance, - framework::Tensor* linear1_out, - framework::Tensor* ln1_out, - framework::Tensor* dropout1_out, - framework::Tensor* dropout2_out, + const phi::DenseTensor& x, + const phi::DenseTensor& linear1_weight, + const phi::DenseTensor* linear1_bias, + const phi::DenseTensor& linear2_weight, + const phi::DenseTensor* linear2_bias, + const phi::DenseTensor* ln1_scale, + const phi::DenseTensor* ln1_bias, + const phi::DenseTensor* ln2_scale, + const phi::DenseTensor* ln2_bias, + phi::DenseTensor* out, + phi::DenseTensor* dropout1_mask, + phi::DenseTensor* dropout2_mask, + phi::DenseTensor* ln1_mean, + phi::DenseTensor* ln1_variance, + phi::DenseTensor* ln2_mean, + phi::DenseTensor* ln2_variance, + phi::DenseTensor* linear1_out, + phi::DenseTensor* ln1_out, + phi::DenseTensor* dropout1_out, + phi::DenseTensor* dropout2_out, const int bsz_seq, const int d_model, const int dim_feedforward, @@ -126,7 +126,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { ctx, bsz_seq, d_model, dropout_param2, epsilon2); using U = LayerNormParamType; - const framework::Tensor* in = &x; + const phi::DenseTensor* in = &x; const U* ln1_scale_ptr = ln1_scale == nullptr ? 
nullptr : ln1_scale->data(); @@ -156,7 +156,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { act_method, dropout1_out->data(), dropout1_mask->data()); - framework::Tensor linear2_out; + phi::DenseTensor linear2_out; linear2_out.Resize({bsz_seq, d_model}); ctx.Alloc(&linear2_out, linear2_out.numel() * sizeof(T)); MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); @@ -197,43 +197,41 @@ class FusedFeedForwardKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* linear1_weight = context.Input("Linear1Weight"); - auto* linear1_bias = context.Input("Linear1Bias"); - auto* linear2_weight = context.Input("Linear2Weight"); - auto* linear2_bias = context.Input("Linear2Bias"); + auto* x = context.Input("X"); + auto* linear1_weight = context.Input("Linear1Weight"); + auto* linear1_bias = context.Input("Linear1Bias"); + auto* linear2_weight = context.Input("Linear2Weight"); + auto* linear2_bias = context.Input("Linear2Bias"); const bool pre_layer_norm = context.Attr("pre_layer_norm"); auto& dev_ctx = context.template device_context(); auto* ln1_scale = - pre_layer_norm ? context.Input("Ln1Scale") : nullptr; + pre_layer_norm ? context.Input("Ln1Scale") : nullptr; auto* ln1_bias = - pre_layer_norm ? context.Input("Ln1Bias") : nullptr; - auto* ln2_scale = !pre_layer_norm - ? context.Input("Ln2Scale") - : nullptr; + pre_layer_norm ? context.Input("Ln1Bias") : nullptr; + auto* ln2_scale = + !pre_layer_norm ? context.Input("Ln2Scale") : nullptr; auto* ln2_bias = - !pre_layer_norm ? context.Input("Ln2Bias") : nullptr; + !pre_layer_norm ? context.Input("Ln2Bias") : nullptr; auto* ln1_mean = - pre_layer_norm ? context.Output("Ln1Mean") : nullptr; + pre_layer_norm ? context.Output("Ln1Mean") : nullptr; auto* ln1_variance = pre_layer_norm - ? context.Output("Ln1Variance") + ? context.Output("Ln1Variance") : nullptr; - auto* ln2_mean = !pre_layer_norm - ? context.Output("Ln2Mean") - : nullptr; + auto* ln2_mean = + !pre_layer_norm ? context.Output("Ln2Mean") : nullptr; auto* ln2_variance = !pre_layer_norm - ? context.Output("Ln2Variance") + ? context.Output("Ln2Variance") : nullptr; - auto* out = context.Output("Out"); - auto* dropout1_mask = context.Output("Dropout1Mask"); - auto* dropout2_mask = context.Output("Dropout2Mask"); - auto* linear1_out = context.Output("Linear1Out"); + auto* out = context.Output("Out"); + auto* dropout1_mask = context.Output("Dropout1Mask"); + auto* dropout2_mask = context.Output("Dropout2Mask"); + auto* linear1_out = context.Output("Linear1Out"); auto* ln1_out = - pre_layer_norm ? context.Output("Ln1Out") : nullptr; - auto* dropout1_out = context.Output("Dropout1Out"); - auto* dropout2_out = context.Output("Dropout2Out"); + pre_layer_norm ? 
context.Output("Ln1Out") : nullptr; + auto* dropout1_out = context.Output("Dropout1Out"); + auto* dropout2_out = context.Output("Dropout2Out"); const std::string act_method = context.Attr("act_method"); @@ -312,11 +310,11 @@ template class FusedFeedForwardGradKernel : public framework::OpKernel { public: void MatMulGrad(const phi::GPUContext& ctx, - const framework::Tensor& d_out, - const framework::Tensor& a, - const framework::Tensor& b, - framework::Tensor* d_a, - framework::Tensor* d_b) const { + const phi::DenseTensor& d_out, + const phi::DenseTensor& a, + const phi::DenseTensor& b, + phi::DenseTensor* d_a, + phi::DenseTensor* d_b) const { auto blas = phi::funcs::GetBlas(ctx); auto a_2d = FoldInitDims(a); auto b_2d = FoldInitDims(b); @@ -330,34 +328,34 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { } void FFNGrad(const phi::GPUContext& ctx, - const framework::Tensor& d_out, - const framework::Tensor& x, - const framework::Tensor& dropout1_mask, - const framework::Tensor& dropout2_mask, - const framework::Tensor& linear1_out, - const framework::Tensor* ln1_out, - const framework::Tensor& dropout1_out, - const framework::Tensor& dropout2_out, - const framework::Tensor& linear1_weight, - const framework::Tensor* linear1_bias, - const framework::Tensor& linear2_weight, - const framework::Tensor* ln1_gamma, - const framework::Tensor* ln1_beta, - const framework::Tensor* ln1_mean, - const framework::Tensor* ln1_variance, - const framework::Tensor* ln2_gamma, - const framework::Tensor* ln2_beta, - const framework::Tensor* ln2_mean, - const framework::Tensor* ln2_variance, - framework::Tensor* d_x, - framework::Tensor* d_linear1_weight, - framework::Tensor* d_linear1_bias, - framework::Tensor* d_linear2_weight, - framework::Tensor* d_linear2_bias, - framework::Tensor* d_ln1_gamma, - framework::Tensor* d_ln1_beta, - framework::Tensor* d_ln2_gamma, - framework::Tensor* d_ln2_beta, + const phi::DenseTensor& d_out, + const phi::DenseTensor& x, + const phi::DenseTensor& dropout1_mask, + const phi::DenseTensor& dropout2_mask, + const phi::DenseTensor& linear1_out, + const phi::DenseTensor* ln1_out, + const phi::DenseTensor& dropout1_out, + const phi::DenseTensor& dropout2_out, + const phi::DenseTensor& linear1_weight, + const phi::DenseTensor* linear1_bias, + const phi::DenseTensor& linear2_weight, + const phi::DenseTensor* ln1_gamma, + const phi::DenseTensor* ln1_beta, + const phi::DenseTensor* ln1_mean, + const phi::DenseTensor* ln1_variance, + const phi::DenseTensor* ln2_gamma, + const phi::DenseTensor* ln2_beta, + const phi::DenseTensor* ln2_mean, + const phi::DenseTensor* ln2_variance, + phi::DenseTensor* d_x, + phi::DenseTensor* d_linear1_weight, + phi::DenseTensor* d_linear1_bias, + phi::DenseTensor* d_linear2_weight, + phi::DenseTensor* d_linear2_bias, + phi::DenseTensor* d_ln1_gamma, + phi::DenseTensor* d_ln1_beta, + phi::DenseTensor* d_ln2_gamma, + phi::DenseTensor* d_ln2_beta, const int bsz_seq, const int d_model, const int dim_feedforward, @@ -396,7 +394,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_ln2_gamma == nullptr ? nullptr : d_ln2_gamma->data(); U* d_ln2_beta_ptr = d_ln2_beta == nullptr ? 
nullptr : d_ln2_beta->data(); - framework::Tensor d_linear2_out, d_dropout2_out, d_residual; + phi::DenseTensor d_linear2_out, d_dropout2_out, d_residual; d_linear2_out.Resize({bsz_seq, d_model}); ctx.Alloc(&d_linear2_out, d_linear2_out.numel() * sizeof(T)); d_dropout2_out.Resize({bsz_seq, d_model}); @@ -433,7 +431,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_residual_ptr); } - framework::Tensor d_dropout1_out; + phi::DenseTensor d_dropout1_out; d_dropout1_out.Resize({bsz_seq, dim_feedforward}); ctx.Alloc(&d_dropout1_out, d_dropout1_out.numel() * sizeof(T)); MatMulGrad(ctx, @@ -443,7 +441,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { &d_dropout1_out, d_linear2_weight); - framework::Tensor d_linear1_out; + phi::DenseTensor d_linear1_out; d_linear1_out.Resize({bsz_seq, dim_feedforward}); ctx.Alloc(&d_linear1_out, d_linear1_out.numel() * sizeof(T)); fused_act_dropout_helper.DropoutActBiasGrad(ctx, @@ -456,7 +454,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { act_method); if (pre_layer_norm) { - framework::Tensor d_ln1_out; + phi::DenseTensor d_ln1_out; d_ln1_out.Resize({bsz_seq, d_model}); ctx.Alloc(&d_ln1_out, d_ln1_out.numel() * sizeof(T)); MatMulGrad(ctx, @@ -484,8 +482,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { if (add_residual) { // gradient accumulation - std::vector ins = {&d_residual, d_x}; - std::vector outs = {d_x}; + std::vector ins = {&d_residual, d_x}; + std::vector outs = {d_x}; phi::funcs::ElementwiseKernel( ctx, ins, &outs, phi::funcs::AddFunctor()); } @@ -495,61 +493,60 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { using U = LayerNormParamType; auto& dev_ctx = context.template device_context(); auto d_out = - *context.Input(framework::GradVarName("Out")); - auto x = *context.Input("X"); + *context.Input(framework::GradVarName("Out")); + auto x = *context.Input("X"); const bool pre_layer_norm = context.Attr("pre_layer_norm"); - auto dropout1_mask = *context.Input("Dropout1Mask"); - auto dropout2_mask = *context.Input("Dropout2Mask"); - auto linear1_out = *context.Input("Linear1Out"); + auto dropout1_mask = *context.Input("Dropout1Mask"); + auto dropout2_mask = *context.Input("Dropout2Mask"); + auto linear1_out = *context.Input("Linear1Out"); auto* ln1_out = - pre_layer_norm ? context.Input("Ln1Out") : nullptr; - auto dropout1_out = *context.Input("Dropout1Out"); - auto dropout2_out = *context.Input("Dropout2Out"); - auto linear1_weight = *context.Input("Linear1Weight"); - auto* linear1_bias = context.Input("Linear1Bias"); - auto linear2_weight = *context.Input("Linear2Weight"); + pre_layer_norm ? context.Input("Ln1Out") : nullptr; + auto dropout1_out = *context.Input("Dropout1Out"); + auto dropout2_out = *context.Input("Dropout2Out"); + auto linear1_weight = *context.Input("Linear1Weight"); + auto* linear1_bias = context.Input("Linear1Bias"); + auto linear2_weight = *context.Input("Linear2Weight"); auto* ln1_mean = - pre_layer_norm ? context.Input("Ln1Mean") : nullptr; + pre_layer_norm ? context.Input("Ln1Mean") : nullptr; auto* ln1_variance = pre_layer_norm - ? context.Input("Ln1Variance") + ? context.Input("Ln1Variance") : nullptr; auto* ln1_scale = - pre_layer_norm ? context.Input("Ln1Scale") : nullptr; + pre_layer_norm ? context.Input("Ln1Scale") : nullptr; auto* ln1_bias = - pre_layer_norm ? context.Input("Ln1Bias") : nullptr; + pre_layer_norm ? context.Input("Ln1Bias") : nullptr; auto* ln2_mean = - !pre_layer_norm ? 
context.Input("Ln2Mean") : nullptr; + !pre_layer_norm ? context.Input("Ln2Mean") : nullptr; auto* ln2_variance = !pre_layer_norm - ? context.Input("Ln2Variance") + ? context.Input("Ln2Variance") : nullptr; - auto* ln2_scale = !pre_layer_norm - ? context.Input("Ln2Scale") - : nullptr; + auto* ln2_scale = + !pre_layer_norm ? context.Input("Ln2Scale") : nullptr; auto* ln2_bias = - !pre_layer_norm ? context.Input("Ln2Bias") : nullptr; + !pre_layer_norm ? context.Input("Ln2Bias") : nullptr; - auto* d_x = context.Output(framework::GradVarName("X")); - auto* d_ln1_scale = pre_layer_norm ? context.Output( + auto* d_x = context.Output(framework::GradVarName("X")); + auto* d_ln1_scale = pre_layer_norm ? context.Output( framework::GradVarName("Ln1Scale")) : nullptr; - auto* d_ln1_bias = pre_layer_norm ? context.Output( + auto* d_ln1_bias = pre_layer_norm ? context.Output( framework::GradVarName("Ln1Bias")) : nullptr; auto* d_ln2_scale = pre_layer_norm ? nullptr - : context.Output( + : context.Output( framework::GradVarName("Ln2Scale")); auto* d_ln2_bias = pre_layer_norm ? nullptr - : context.Output( + : context.Output( framework::GradVarName("Ln2Bias")); - auto* d_linear1_weight = context.Output( + auto* d_linear1_weight = context.Output( framework::GradVarName("Linear1Weight")); - auto* d_linear1_bias = context.Output( - framework::GradVarName("Linear1Bias")); - auto* d_linear2_weight = context.Output( + auto* d_linear1_bias = + context.Output(framework::GradVarName("Linear1Bias")); + auto* d_linear2_weight = context.Output( framework::GradVarName("Linear2Weight")); - auto* d_linear2_bias = context.Output( - framework::GradVarName("Linear2Bias")); + auto* d_linear2_bias = + context.Output(framework::GradVarName("Linear2Bias")); const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 12db3e6e0d63d..e50cc24d88adf 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -24,9 +24,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; -inline std::string MemoryDebugString(const Tensor& t) { +inline std::string MemoryDebugString(const phi::DenseTensor& t) { int device_id = platform::GetCurrentDeviceId(); int64_t allocated = memory::DeviceMemoryStatCurrentValue("Allocated", device_id); @@ -46,7 +46,7 @@ inline std::string MemoryDebugString(const Tensor& t) { template void AllocWithDebugInfo(const phi::GPUContext& dev_ctx, const std::string& info, - Tensor* t) { + phi::DenseTensor* t) { dev_ctx.Alloc(t, t->numel() * sizeof(T)); VLOG(4) << info << ": " << MemoryDebugString(*t); } @@ -87,10 +87,10 @@ struct GateAttentionConfig { phi::DDim gate_out_dims; GateAttentionConfig(const phi::GPUContext& dev_ctx, - const Tensor* query, - const Tensor* key, - const Tensor* query_weight, - const Tensor* qkv_weight, + const phi::DenseTensor* query, + const phi::DenseTensor* key, + const phi::DenseTensor* query_weight, + const phi::DenseTensor* qkv_weight, bool merge_qkv, bool has_gating) : dev_ctx(dev_ctx), merge_qkv(merge_qkv), has_gating(has_gating) { @@ -152,7 +152,7 @@ struct GateAttentionConfig { return batch_size * seq_len_m * seq_len_r * num_heads * head_dim; } - Tensor* GetQKVOut() { + phi::DenseTensor* GetQKVOut() { if (!qkv_out.IsInitialized()) { qkv_out.Resize(qkv_out_dims); AllocWithDebugInfo(dev_ctx, "qkv_out", &qkv_out); @@ -160,7 +160,7 @@ struct GateAttentionConfig { return &qkv_out; } - Tensor* GetQueryOut() { + phi::DenseTensor* GetQueryOut() { if (!query_out.IsInitialized()) { query_out.Resize(q_out_dims); AllocWithDebugInfo(dev_ctx, "query_out", &query_out); @@ -168,7 +168,7 @@ struct GateAttentionConfig { return &query_out; } - Tensor* GetKeyOut() { + phi::DenseTensor* GetKeyOut() { if (!key_out.IsInitialized()) { key_out.Resize(kv_out_dims); AllocWithDebugInfo(dev_ctx, "key_out", &key_out); @@ -176,7 +176,7 @@ struct GateAttentionConfig { return &key_out; } - Tensor* GetValueOut() { + phi::DenseTensor* GetValueOut() { if (!value_out.IsInitialized()) { value_out.Resize(kv_out_dims); AllocWithDebugInfo(dev_ctx, "value_out", &value_out); @@ -184,7 +184,7 @@ struct GateAttentionConfig { return &value_out; } - Tensor* GetQKOut(Tensor* softmax_out) { + phi::DenseTensor* GetQKOut(phi::DenseTensor* softmax_out) { // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1] int softmax_dim = m_size; if (!softmax_out || phi::UseCudnnSoftmax(dev_ctx, softmax_dim, true)) { @@ -200,7 +200,7 @@ struct GateAttentionConfig { } } - Tensor* GetQKTVOut(Tensor* gate_out) { + phi::DenseTensor* GetQKTVOut(phi::DenseTensor* gate_out) { if (has_gating && gate_out) { // Reuse gate_out. 
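The Get*Out() accessors in GateAttentionConfig above share one lazy-allocation shape. A sketch of GetQKVOut with its template argument filled in, assumed to be T to match AllocWithDebugInfo's other uses in this header:

// Sketch of the lazy-allocation accessor pattern used by the Get*Out()
// methods above; the buffer is resized and allocated only on first use.
phi::DenseTensor* GetQKVOut() {
  if (!qkv_out.IsInitialized()) {
    qkv_out.Resize(qkv_out_dims);
    AllocWithDebugInfo<T>(dev_ctx, "qkv_out", &qkv_out);
  }
  return &qkv_out;
}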
gate_out->Resize(qktv_out_dims); @@ -250,10 +250,10 @@ template struct GateAttentionGradConfig : public GateAttentionConfig { public: GateAttentionGradConfig(const phi::GPUContext& dev_ctx, - const Tensor* query, - const Tensor* key, - const Tensor* query_weight, - const Tensor* qkv_weight, + const phi::DenseTensor* query, + const phi::DenseTensor* key, + const phi::DenseTensor* query_weight, + const phi::DenseTensor* qkv_weight, bool merge_qkv, bool has_gating) : GateAttentionConfig(dev_ctx, @@ -264,7 +264,7 @@ struct GateAttentionGradConfig : public GateAttentionConfig { merge_qkv, has_gating) {} - Tensor* GetQKVOutGrad() { + phi::DenseTensor* GetQKVOutGrad() { if (!qkv_out_grad.IsInitialized()) { qkv_out_grad.Resize(this->qkv_out_dims); AllocWithDebugInfo(this->dev_ctx, "qkv_out_grad", &qkv_out_grad); @@ -272,7 +272,7 @@ struct GateAttentionGradConfig : public GateAttentionConfig { return &qkv_out_grad; } - Tensor* GetQueryOutGrad() { + phi::DenseTensor* GetQueryOutGrad() { if (!query_out_grad.IsInitialized()) { query_out_grad.Resize(this->q_out_dims); AllocWithDebugInfo(this->dev_ctx, "query_out_grad", &query_out_grad); @@ -280,7 +280,7 @@ struct GateAttentionGradConfig : public GateAttentionConfig { return &query_out_grad; } - Tensor* GetKeyOutGrad() { + phi::DenseTensor* GetKeyOutGrad() { if (!key_out_grad.IsInitialized()) { key_out_grad.Resize(this->kv_out_dims); AllocWithDebugInfo(this->dev_ctx, "key_out_grad", &key_out_grad); @@ -288,7 +288,7 @@ struct GateAttentionGradConfig : public GateAttentionConfig { return &key_out_grad; } - Tensor* GetValueOutGrad() { + phi::DenseTensor* GetValueOutGrad() { if (!value_out_grad.IsInitialized()) { value_out_grad.Resize(this->kv_out_dims); AllocWithDebugInfo(this->dev_ctx, "value_out_grad", &value_out_grad); @@ -296,7 +296,7 @@ struct GateAttentionGradConfig : public GateAttentionConfig { return &value_out_grad; } - Tensor* GetQKOutGrad(Tensor* softmax_out_grad) { + phi::DenseTensor* GetQKOutGrad(phi::DenseTensor* softmax_out_grad) { // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1] int softmax_dim = this->m_size; if (!softmax_out_grad || @@ -325,15 +325,15 @@ class FMHAGateRef { FMHAGateRef(const phi::GPUContext& dev_ctx, bool merge_qkv) : dev_ctx_(dev_ctx), merge_qkv_(merge_qkv) {} - void ComputeForward(const Tensor* nonbatched_bias, - const Tensor* src_mask, - Tensor* q_transpose_out, - Tensor* k_transpose_out, - Tensor* v_transpose_out, - Tensor* qkv_transpose_out, - Tensor* softmax_out, - Tensor* fmha_out, - Tensor* gate_out, + void ComputeForward(const phi::DenseTensor* nonbatched_bias, + const phi::DenseTensor* src_mask, + phi::DenseTensor* q_transpose_out, + phi::DenseTensor* k_transpose_out, + phi::DenseTensor* v_transpose_out, + phi::DenseTensor* qkv_transpose_out, + phi::DenseTensor* softmax_out, + phi::DenseTensor* fmha_out, + phi::DenseTensor* gate_out, GateAttentionConfig* config) { T* q_ptr = nullptr; T* k_ptr = nullptr; @@ -345,7 +345,7 @@ class FMHAGateRef { platform::errors::NotFound("The input qkv_transpose_out can not be " "nullptr when merge_qkv is true.")); - Tensor* qkv_out = config->GetQKVOut(); + phi::DenseTensor* qkv_out = config->GetQKVOut(); ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); config->ClearQKVOut(); @@ -368,9 +368,9 @@ class FMHAGateRef { platform::errors::NotFound("The input v_transpose_out can not be " "nullptr when merge_qkv is false.")); - Tensor* query_out = config->GetQueryOut(); - Tensor* key_out = config->GetKeyOut(); - Tensor* value_out = config->GetValueOut(); + phi::DenseTensor* 
query_out = config->GetQueryOut(); + phi::DenseTensor* key_out = config->GetKeyOut(); + phi::DenseTensor* value_out = config->GetValueOut(); ComputeQKVTransposeForward(*query_out, *key_out, *value_out, @@ -388,7 +388,7 @@ class FMHAGateRef { // [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] * // [batch_size, seq_len_m, num_heads, m_size, head_dim] // -> [batch_size, seq_len_m, num_heads, seq_len_r, m_size] - Tensor* qk_out = config->GetQKOut(softmax_out); + phi::DenseTensor* qk_out = config->GetQKOut(softmax_out); T* qk_out_ptr = qk_out->data(); int64_t gemm_batch_size = @@ -418,7 +418,7 @@ class FMHAGateRef { // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] * // [batch_size, seq_len_m, num_heads, m_size, head_dim] // -> [batch_size, seq_len_m, num_heads, seq_len_r, head_dim] - Tensor* qktv_out = config->GetQKTVOut(gate_out); + phi::DenseTensor* qktv_out = config->GetQKTVOut(gate_out); T* qktv_out_ptr = qktv_out->data(); gemm_m = config->seq_len_r; @@ -444,14 +444,14 @@ class FMHAGateRef { } } - void ComputeBackward(const Tensor* q_transpose_out, - const Tensor* k_transpose_out, - const Tensor* v_transpose_out, - const Tensor* qkv_transpose_out, - const Tensor* softmax_out, - const Tensor* fmha_out_grad, - Tensor* src_mask_grad, - Tensor* nonbatched_bias_grad, + void ComputeBackward(const phi::DenseTensor* q_transpose_out, + const phi::DenseTensor* k_transpose_out, + const phi::DenseTensor* v_transpose_out, + const phi::DenseTensor* qkv_transpose_out, + const phi::DenseTensor* softmax_out, + const phi::DenseTensor* fmha_out_grad, + phi::DenseTensor* src_mask_grad, + phi::DenseTensor* nonbatched_bias_grad, GateAttentionGradConfig* config) { const T* q_ptr = nullptr; const T* k_ptr = nullptr; @@ -562,7 +562,7 @@ class FMHAGateRef { gemm_batch_size); } - Tensor* qk_out_grad = config->GetQKOutGrad(&softmax_out_grad); + phi::DenseTensor* qk_out_grad = config->GetQKOutGrad(&softmax_out_grad); ComputeBiasMaskSoftmaxBackward(&softmax_out_grad, softmax_out, src_mask_grad, @@ -604,12 +604,12 @@ class FMHAGateRef { alpha); if (merge_qkv_) { - Tensor* qkv_out_grad = config->GetQKVOutGrad(); + phi::DenseTensor* qkv_out_grad = config->GetQKVOutGrad(); ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); } else { - Tensor* q_out_grad = config->GetQueryOutGrad(); - Tensor* k_out_grad = config->GetKeyOutGrad(); - Tensor* v_out_grad = config->GetValueOutGrad(); + phi::DenseTensor* q_out_grad = config->GetQueryOutGrad(); + phi::DenseTensor* k_out_grad = config->GetKeyOutGrad(); + phi::DenseTensor* v_out_grad = config->GetValueOutGrad(); ComputeQKVTransposeBackward(q_transpose_out_grad, k_transpose_out_grad, v_transpose_out_grad, @@ -619,24 +619,24 @@ class FMHAGateRef { } } - void ComputeQKVTransposeForward(const Tensor& q_out, - const Tensor& k_out, - const Tensor& v_out, - Tensor* q_transpose_out, - Tensor* k_transpose_out, - Tensor* v_transpose_out) { + void ComputeQKVTransposeForward(const phi::DenseTensor& q_out, + const phi::DenseTensor& k_out, + const phi::DenseTensor& v_out, + phi::DenseTensor* q_transpose_out, + phi::DenseTensor* k_transpose_out, + phi::DenseTensor* v_transpose_out) { std::vector perm = {0, 1, 3, 2, 4}; TransposeGPUKernelDriver(dev_ctx_, q_out, perm, q_transpose_out); TransposeGPUKernelDriver(dev_ctx_, k_out, perm, k_transpose_out); TransposeGPUKernelDriver(dev_ctx_, v_out, perm, v_transpose_out); } - void ComputeQKVTransposeBackward(const Tensor& q_transpose_out_grad, - const Tensor& k_transpose_out_grad, - const Tensor& v_transpose_out_grad, - 
Tensor* q_out_grad, - Tensor* k_out_grad, - Tensor* v_out_grad) { + void ComputeQKVTransposeBackward(const phi::DenseTensor& q_transpose_out_grad, + const phi::DenseTensor& k_transpose_out_grad, + const phi::DenseTensor& v_transpose_out_grad, + phi::DenseTensor* q_out_grad, + phi::DenseTensor* k_out_grad, + phi::DenseTensor* v_out_grad) { std::vector perm = {0, 1, 3, 2, 4}; TransposeGPUKernelDriver( dev_ctx_, q_transpose_out_grad, perm, q_out_grad); @@ -648,14 +648,15 @@ class FMHAGateRef { // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> // [3, batch_size, seq_len_m, num_heads, seq_len_r, head_dim] - void ComputeQKVTransposeForward(const Tensor& qkv_out, - Tensor* qkv_transpose_out) { + void ComputeQKVTransposeForward(const phi::DenseTensor& qkv_out, + phi::DenseTensor* qkv_transpose_out) { std::vector perm = {3, 0, 1, 4, 2, 5}; TransposeGPUKernelDriver(dev_ctx_, qkv_out, perm, qkv_transpose_out); } - void ComputeQKVTransposeBackward(const Tensor& qkv_transpose_out_grad, - Tensor* qkv_out_grad) { + void ComputeQKVTransposeBackward( + const phi::DenseTensor& qkv_transpose_out_grad, + phi::DenseTensor* qkv_out_grad) { std::vector perm = {1, 2, 4, 0, 3, 5}; TransposeGPUKernelDriver( dev_ctx_, qkv_transpose_out_grad, perm, qkv_out_grad); @@ -663,31 +664,33 @@ class FMHAGateRef { // [batch_size, seq_len_m, num_head, seq_len_r, c] -> // [batch_size, seq_len_m, seq_len_r, num_head, c] - void ComputeQKTVTransposeForward(const Tensor& qktv_out, Tensor* fmha_out) { + void ComputeQKTVTransposeForward(const phi::DenseTensor& qktv_out, + phi::DenseTensor* fmha_out) { std::vector perm = {0, 1, 3, 2, 4}; TransposeGPUKernelDriver(dev_ctx_, qktv_out, perm, fmha_out); } - void ComputeQKTVTransposeBackward(const Tensor& fmha_out_grad, - Tensor* qktv_out_grad) { + void ComputeQKTVTransposeBackward(const phi::DenseTensor& fmha_out_grad, + phi::DenseTensor* qktv_out_grad) { std::vector perm = {0, 1, 3, 2, 4}; TransposeGPUKernelDriver(dev_ctx_, fmha_out_grad, perm, qktv_out_grad); } // qk_out = qk_out + nonbatched_bias + src_mask // softmax_out = softmax(src_mask_out) - void ComputeBiasMaskSoftmaxForward(const Tensor* nonbatched_bias, - const Tensor* src_mask, - Tensor* qk_out, - Tensor* softmax_out) { + void ComputeBiasMaskSoftmaxForward(const phi::DenseTensor* nonbatched_bias, + const phi::DenseTensor* src_mask, + phi::DenseTensor* qk_out, + phi::DenseTensor* softmax_out) { if (nonbatched_bias) { - std::vector ins = {qk_out, src_mask, nonbatched_bias}; - std::vector outs = {qk_out}; + std::vector ins = { + qk_out, src_mask, nonbatched_bias}; + std::vector outs = {qk_out}; phi::funcs::BroadcastKernel( dev_ctx_, ins, &outs, -1, TernaryAddFunctor()); } else { - std::vector ins = {qk_out, src_mask}; - std::vector outs = {qk_out}; + std::vector ins = {qk_out, src_mask}; + std::vector outs = {qk_out}; phi::funcs::BroadcastKernel( dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); } @@ -696,11 +699,11 @@ class FMHAGateRef { // src_mask_out = qk_out + nonbatched_bias + src_mask // softmax_out = softmax(src_mask_out) - void ComputeBiasMaskSoftmaxBackward(const Tensor* softmax_out_grad, - const Tensor* softmax_out, - Tensor* src_mask_grad, - Tensor* qk_out_grad, - Tensor* nonbatched_bias_grad) { + void ComputeBiasMaskSoftmaxBackward(const phi::DenseTensor* softmax_out_grad, + const phi::DenseTensor* softmax_out, + phi::DenseTensor* src_mask_grad, + phi::DenseTensor* qk_out_grad, + phi::DenseTensor* nonbatched_bias_grad) { PADDLE_ENFORCE_NOT_NULL( qk_out_grad, platform::errors::NotFound("The 
qk_out_grad can not be nullptr.")); diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index 0823f391fd086..ce7929c39ffa8 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DDim = framework::DDim; class FusedGateAttentionOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 413dc41dbd17c..8f13424ce49b5 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template struct SigmoidMultiplyFunctor { @@ -69,7 +69,7 @@ void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx, // query: shape=[batch_size, seq_len_m, seq_len_r, qkv_dim] // qkv_weight: shape=[3, num_heads, head_dim, qkv_dim] // qkv_out: shape=[batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] - auto *qkv_weight = ctx.Input("QKVWeight"); + auto *qkv_weight = ctx.Input("QKVWeight"); // qkv_out = GEMM(query, qkv_weight^T) int m = config.batch_size * config.seq_len_m * config.seq_len_r; @@ -87,9 +87,9 @@ void ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, const Tensor *qkv_out_grad, Tensor *query_grad, bool use_addto) { - auto *qkv_weight = ctx.Input("QKVWeight"); + auto *qkv_weight = ctx.Input("QKVWeight"); auto *qkv_weight_grad = - ctx.Output(framework::GradVarName("QKVWeight")); + ctx.Output(framework::GradVarName("QKVWeight")); auto &dev_ctx = ctx.template device_context(); dev_ctx.Alloc(qkv_weight_grad, qkv_weight_grad->numel() * sizeof(T)); @@ -116,9 +116,9 @@ void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx, Tensor *query_out, Tensor *key_out, Tensor *value_out) { - auto *query_weight = ctx.Input("QueryWeight"); - auto *key_weight = ctx.Input("KeyWeight"); - auto *value_weight = ctx.Input("ValueWeight"); + auto *query_weight = ctx.Input("QueryWeight"); + auto *key_weight = ctx.Input("KeyWeight"); + auto *value_weight = ctx.Input("ValueWeight"); // query_out = GEMM(query, query_weight) // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] @@ -158,9 +158,9 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, Tensor *key_grad, bool use_addto) { // Gradient of GEMM(key, k_weight) - const auto *key_weight = ctx.Input("KeyWeight"); + const auto *key_weight = ctx.Input("KeyWeight"); auto *key_weight_grad = - ctx.Output(framework::GradVarName("KeyWeight")); + ctx.Output(framework::GradVarName("KeyWeight")); auto &dev_ctx = ctx.template device_context(); dev_ctx.Alloc(key_weight_grad, key_weight_grad->numel() * sizeof(T)); @@ -173,9 +173,9 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, key, key_weight, key_out_grad, key_grad, key_weight_grad, nullptr, false); // Gradient of GEMM(value, v_weight) - auto *value_weight = ctx.Input("ValueWeight"); + auto *value_weight = ctx.Input("ValueWeight"); auto *value_weight_grad = - ctx.Output(framework::GradVarName("ValueWeight")); + ctx.Output(framework::GradVarName("ValueWeight")); 
dev_ctx.Alloc(value_weight_grad, value_weight_grad->numel() * sizeof(T)); kv_compute.ComputeBackward(key, @@ -187,9 +187,9 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, true); // Gradient of GEMM(query, query_weight) - const auto *query_weight = ctx.Input("QueryWeight"); + const auto *query_weight = ctx.Input("QueryWeight"); auto *query_weight_grad = - ctx.Output(framework::GradVarName("QueryWeight")); + ctx.Output(framework::GradVarName("QueryWeight")); dev_ctx.Alloc(query_weight_grad, query_weight_grad->numel() * sizeof(T)); int q_m = config.batch_size * config.seq_len_m * config.seq_len_r; @@ -212,8 +212,8 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx, const Tensor *query, const Tensor *fmha_out, Tensor *gate_out) { - auto *gate_weight = ctx.Input("GateWeight"); - auto *gate_bias = ctx.Input("GateBias"); + auto *gate_weight = ctx.Input("GateWeight"); + auto *gate_bias = ctx.Input("GateBias"); // The first gate_bias_out stores the result of the multiplication, // and the second gate_bias_out stores the result of the multiplication + @@ -242,8 +242,8 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, const Tensor *gate_out_grad, Tensor *query_grad, Tensor *fmha_out_grad) { - const auto *gate_weight = ctx.Input("GateWeight"); - const auto *gate_bias = ctx.Input("GateBias"); + const auto *gate_weight = ctx.Input("GateWeight"); + const auto *gate_bias = ctx.Input("GateBias"); auto &dev_ctx = ctx.template device_context(); // Re-compute gate_bias_out Tensor gate_bias_out; @@ -267,8 +267,9 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, // Gradient of GEMM(query, gate_weight) + gate_bias auto *gate_weight_grad = - ctx.Output(framework::GradVarName("GateWeight")); - auto *gate_bias_grad = ctx.Output(framework::GradVarName("GateBias")); + ctx.Output(framework::GradVarName("GateWeight")); + auto *gate_bias_grad = + ctx.Output(framework::GradVarName("GateBias")); dev_ctx.Alloc(gate_weight_grad, gate_weight_grad->numel() * sizeof(T)); dev_ctx.Alloc(gate_bias_grad, gate_bias_grad->numel() * sizeof(T)); @@ -285,8 +286,9 @@ void ComputeOutputLinearForward(const framework::ExecutionContext &ctx, const GateAttentionConfig &config, const Tensor *fmha_or_gate_out, Tensor *out) { - const auto *out_linear_weight = ctx.Input("OutLinearWeight"); - const auto *out_linear_bias = ctx.Input("OutLinearBias"); + const auto *out_linear_weight = + ctx.Input("OutLinearWeight"); + const auto *out_linear_bias = ctx.Input("OutLinearBias"); // out = GEMM(fmha_or_gate_out, out_linear_weight) + out_linear_bias int m = config.batch_size * config.seq_len_m * config.seq_len_r; @@ -304,13 +306,15 @@ void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, const Tensor *input, Tensor *input_grad) { auto &dev_ctx = ctx.template device_context(); - const auto *out_grad = ctx.Input(framework::GradVarName("Out")); - const auto *out_linear_weight = ctx.Input("OutLinearWeight"); + const auto *out_grad = + ctx.Input(framework::GradVarName("Out")); + const auto *out_linear_weight = + ctx.Input("OutLinearWeight"); auto *out_linear_weight_grad = - ctx.Output(framework::GradVarName("OutLinearWeight")); + ctx.Output(framework::GradVarName("OutLinearWeight")); auto *out_linear_bias_grad = - ctx.Output(framework::GradVarName("OutLinearBias")); + ctx.Output(framework::GradVarName("OutLinearBias")); dev_ctx.Alloc(out_linear_weight_grad, out_linear_weight_grad->numel() * sizeof(T)); @@ -334,23 +338,23 @@ template 
class FusedGateAttentionOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *query = ctx.Input("Query"); - const auto *key = ctx.Input("Key"); - const auto *query_weight = ctx.Input("QueryWeight"); - const auto *qkv_weight = ctx.Input("QKVWeight"); + const auto *query = ctx.Input("Query"); + const auto *key = ctx.Input("Key"); + const auto *query_weight = ctx.Input("QueryWeight"); + const auto *qkv_weight = ctx.Input("QKVWeight"); - const auto *src_mask = ctx.Input("SrcMask"); - const auto *nonbatched_bias = ctx.Input("NonbatchedBias"); + const auto *src_mask = ctx.Input("SrcMask"); + const auto *nonbatched_bias = ctx.Input("NonbatchedBias"); - auto *q_transpose_out = ctx.Output("QueryTransposeOut"); - auto *k_transpose_out = ctx.Output("KeyTransposeOut"); - auto *v_transpose_out = ctx.Output("ValueTransposeOut"); - auto *qkv_transpose_out = ctx.Output("QKVTransposeOut"); + auto *q_transpose_out = ctx.Output("QueryTransposeOut"); + auto *k_transpose_out = ctx.Output("KeyTransposeOut"); + auto *v_transpose_out = ctx.Output("ValueTransposeOut"); + auto *qkv_transpose_out = ctx.Output("QKVTransposeOut"); - auto *softmax_out = ctx.Output("SoftmaxOut"); - auto *fmha_out = ctx.Output("FMHAOut"); - auto *gate_out = ctx.Output("GateOut"); - auto *out = ctx.Output("Out"); + auto *softmax_out = ctx.Output("SoftmaxOut"); + auto *fmha_out = ctx.Output("FMHAOut"); + auto *gate_out = ctx.Output("GateOut"); + auto *out = ctx.Output("Out"); const bool merge_qkv = ctx.Attr("merge_qkv"); const bool has_gating = ctx.Attr("has_gating"); @@ -424,24 +428,29 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { // forward input - const auto *query = ctx.Input("Query"); - const auto *key = ctx.Input("Key"); - const auto *query_weight = ctx.Input("QueryWeight"); - const auto *qkv_weight = ctx.Input("QKVWeight"); + const auto *query = ctx.Input("Query"); + const auto *key = ctx.Input("Key"); + const auto *query_weight = ctx.Input("QueryWeight"); + const auto *qkv_weight = ctx.Input("QKVWeight"); // forward output, backward input - const auto *q_transpose_out = ctx.Input("QueryTransposeOut"); - const auto *k_transpose_out = ctx.Input("KeyTransposeOut"); - const auto *v_transpose_out = ctx.Input("ValueTransposeOut"); - const auto *qkv_transpose_out = ctx.Input("QKVTransposeOut"); - const auto *softmax_out = ctx.Input("SoftmaxOut"); - const auto *fmha_out = ctx.Input("FMHAOut"); - const auto *gate_out = ctx.Input("GateOut"); + const auto *q_transpose_out = + ctx.Input("QueryTransposeOut"); + const auto *k_transpose_out = + ctx.Input("KeyTransposeOut"); + const auto *v_transpose_out = + ctx.Input("ValueTransposeOut"); + const auto *qkv_transpose_out = + ctx.Input("QKVTransposeOut"); + const auto *softmax_out = ctx.Input("SoftmaxOut"); + const auto *fmha_out = ctx.Input("FMHAOut"); + const auto *gate_out = ctx.Input("GateOut"); // backward output - auto *query_grad = ctx.Output(framework::GradVarName("Query")); + auto *query_grad = + ctx.Output(framework::GradVarName("Query")); auto *nonbatched_bias_grad = - ctx.Output(framework::GradVarName("NonbatchedBias")); + ctx.Output(framework::GradVarName("NonbatchedBias")); bool has_gating = ctx.Attr("has_gating"); bool merge_qkv = ctx.Attr("merge_qkv"); @@ -501,7 +510,8 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { ctx, config, query, qkv_out_grad, query_grad, use_addto); } else { // 
4. Gradient of Separated QKV Matmul - auto *key_grad = ctx.Output(framework::GradVarName("Key")); + auto *key_grad = + ctx.Output(framework::GradVarName("Key")); if (key_grad) { AllocWithDebugInfo(dev_ctx, "key_grad", key_grad); } diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index d14e30a5f7f2a..e5f80e2511e2d 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusedGemmEpilogueOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index 22340210b5715..5f3c60df9a080 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class FusedGemmEpilogueKernel : public framework::OpKernel { @@ -31,12 +31,13 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - const Tensor* x = ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); - const Tensor* bias = ctx.Input("Bias"); + const phi::DenseTensor* x = ctx.Input("X"); + const phi::DenseTensor* y = ctx.Input("Y"); + const phi::DenseTensor* bias = ctx.Input("Bias"); - Tensor* out = ctx.Output("Out"); - Tensor* reserve_space = ctx.Output("ReserveSpace"); + phi::DenseTensor* out = ctx.Output("Out"); + phi::DenseTensor* reserve_space = + ctx.Output("ReserveSpace"); bool trans_x = ctx.Attr("trans_x"); bool trans_y = ctx.Attr("trans_y"); @@ -322,14 +323,15 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { static void ComputeImpl(const framework::ExecutionContext& ctx) { using Trait = FusedGEMMGradTrait; auto& dev_ctx = ctx.template device_context(); - const Tensor* dout = ctx.Input("DOut"); - const Tensor* x = ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); - const Tensor* reserve_space = ctx.Input("ReserveSpace"); - - Tensor* dx = ctx.Output("DX"); - Tensor* dy = ctx.Output("DY"); - Tensor* dbias = ctx.Output("DBias"); + const phi::DenseTensor* dout = ctx.Input("DOut"); + const phi::DenseTensor* x = ctx.Input("X"); + const phi::DenseTensor* y = ctx.Input("Y"); + const phi::DenseTensor* reserve_space = + ctx.Input("ReserveSpace"); + + phi::DenseTensor* dx = ctx.Output("DX"); + phi::DenseTensor* dy = ctx.Output("DY"); + phi::DenseTensor* dbias = ctx.Output("DBias"); std::string activation_grad = ctx.Attr("activation_grad"); diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc index 2b4b03e32cd8e..b1707ff55950d 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class FusedGemmEpilogueXPUKernel : public framework::OpKernel { @@ -32,12 +32,13 @@ class FusedGemmEpilogueXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - const Tensor* x = ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); - const Tensor* bias = ctx.Input("Bias"); + const phi::DenseTensor* x = ctx.Input("X"); + const phi::DenseTensor* y = ctx.Input("Y"); + const phi::DenseTensor* bias = ctx.Input("Bias"); - Tensor* out = ctx.Output("Out"); - Tensor* reserve_space = ctx.Output("ReserveSpace"); + phi::DenseTensor* out = ctx.Output("Out"); + phi::DenseTensor* reserve_space = + ctx.Output("ReserveSpace"); bool trans_x = ctx.Attr("trans_x"); bool trans_y = ctx.Attr("trans_y"); @@ -112,15 +113,16 @@ class FusedGemmEpilogueXPUGradKernel : public framework::OpKernel { bool trans_x = ctx.Attr("trans_x"); bool trans_y = ctx.Attr("trans_y"); auto& dev_ctx = ctx.template device_context(); - const Tensor* dout = ctx.Input("DOut"); - const Tensor* x = ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); + const phi::DenseTensor* dout = ctx.Input("DOut"); + const phi::DenseTensor* x = ctx.Input("X"); + const phi::DenseTensor* y = ctx.Input("Y"); - const Tensor* reserve_space = ctx.Input("ReserveSpace"); + const phi::DenseTensor* reserve_space = + ctx.Input("ReserveSpace"); - Tensor* dx = ctx.Output("DX"); - Tensor* dy = ctx.Output("DY"); - Tensor* dbias = ctx.Output("DBias"); + phi::DenseTensor* dx = ctx.Output("DX"); + phi::DenseTensor* dy = ctx.Output("DY"); + phi::DenseTensor* dbias = ctx.Output("DBias"); std::string activation = "none"; if (ctx.HasAttr("activation")) { diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index d3c6cca95efb0..f383d6846f946 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -41,9 +41,9 @@ struct TestFusedLayernormResidualDropoutBias { bool has_bias = true; bool has_scale = true; bool has_layernorm_bias = true; - framework::Tensor src, residual, bias, out, mask, scale, layernorm_bias, + phi::DenseTensor src, residual, bias, out, mask, scale, layernorm_bias, layernorm_out, means, vars; - framework::Tensor dsrc, dbias; + phi::DenseTensor dsrc, dbias; std::vector src_vec, residual_vec, bias_vec; std::vector> means_vec, vars_vec, scale_vec, diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc index 9572a87aba21d..2a2d1f27edd9c 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { private: diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu index 8e200275f8171..fe1ee3449a102 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu @@ -25,9 +25,9 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { using U = LayerNormParamType; auto &dev_ctx = ctx.cuda_device_context(); - auto *time_step = ctx.Input("TimeStep"); + auto *time_step = ctx.Input("TimeStep"); // 0. input - auto *input_x = ctx.Input("X"); + auto *input_x = ctx.Input("X"); const auto input_x_dims = input_x->dims(); int bsz = input_x_dims[0]; int seq_len = input_x_dims[1]; @@ -48,10 +48,11 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { // dequant output scales, tensor, size = [num_layers, n], n is gemm output // size - auto *qkv_out_scale = ctx.Input("QKVOutScale"); - auto *out_linear_out_scale = ctx.Input("OutLinearOutScale"); - auto *ffn1_out_scale = ctx.Input("FFN1OutScale"); - auto *ffn2_out_scale = ctx.Input("FFN2OutScale"); + auto *qkv_out_scale = ctx.Input("QKVOutScale"); + auto *out_linear_out_scale = + ctx.Input("OutLinearOutScale"); + auto *ffn1_out_scale = ctx.Input("FFN1OutScale"); + auto *ffn2_out_scale = ctx.Input("FFN2OutScale"); int qkv_out_scale_n = qkv_out_scale->dims()[1]; int out_linear_out_scale_n = out_linear_out_scale->dims()[1]; @@ -61,8 +62,8 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { // 1. layer norm const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); const float epsilon = ctx.Attr("epsilon"); - auto ln_scales = ctx.MultiInput("LnScale"); - auto ln_biases = ctx.MultiInput("LnBias"); + auto ln_scales = ctx.MultiInput("LnScale"); + auto ln_biases = ctx.MultiInput("LnBias"); auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); @@ -76,8 +77,8 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { // 2. qkv // x: qkv's input [batch_size, seq_len, dim_embed] // y: qkv's weight: [3, num_head, dim_head, dim_embed] - auto qkv_weights = ctx.MultiInput("QKVW"); - auto qkv_biases = ctx.MultiInput("QKVBias"); + auto qkv_weights = ctx.MultiInput("QKVW"); + auto qkv_biases = ctx.MultiInput("QKVBias"); const bool trans_qkvw = ctx.Attr("trans_qkvw"); const auto qkv_w_dims = qkv_weights[0]->dims(); int num_head = trans_qkvw ? qkv_w_dims[1] : qkv_w_dims[2]; @@ -100,10 +101,10 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { true, "upscale_in_train", 0.0, true, true, 0, nullptr); auto fmha_compute = FMHARef(dev_ctx, bsz, seq_len, num_head, dim_head, attn_param); - auto *src_mask = ctx.Input("SrcMask"); - auto cache_kvs = ctx.MultiInput("CacheKV"); - auto cache_kv_outs = ctx.MultiOutput("CacheKVOut"); - // auto *time_step = ctx.Input("TimeStep"); + auto *src_mask = ctx.Input("SrcMask"); + auto cache_kvs = ctx.MultiInput("CacheKV"); + auto cache_kv_outs = ctx.MultiOutput("CacheKVOut"); + // auto *time_step = ctx.Input("TimeStep"); auto out_seq_len = seq_len; if (time_step) { @@ -156,8 +157,8 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { dev_ctx.Alloc(&fmha_out, fmha_out.numel() * sizeof(T)); // 4. 
out_linear - auto out_linear_weights = ctx.MultiInput("OutLinearW"); - auto out_linear_biases = ctx.MultiInput("OutLinearBias"); + auto out_linear_weights = ctx.MultiInput("OutLinearW"); + auto out_linear_biases = ctx.MultiInput("OutLinearBias"); int ring_id = ctx.Attr("ring_id"); // (transA, transB, compute_bias) = (false, false, false) AttnMatmulINT8 out_linear_compute( @@ -171,8 +172,8 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { FusedDropoutLayerNormHelper fused_dropout_layernorm_helper_for_post_layernorm( dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); - auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); - auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); + auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); + auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); Tensor bias_dropout_residual_out, dropout_mask_out; T *bias_dropout_residual_out_data = nullptr; if (pre_layer_norm) { @@ -186,8 +187,8 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { &dropout_mask_out, dropout_mask_out.numel() * sizeof(uint8_t)); // 6. ffn matmul1 - auto ffn1_weights = ctx.MultiInput("FFN1Weight"); - auto ffn1_biases = ctx.MultiInput("FFN1Bias"); + auto ffn1_weights = ctx.MultiInput("FFN1Weight"); + auto ffn1_biases = ctx.MultiInput("FFN1Bias"); auto ffn1_weight_dim = ffn1_weights[0]->dims(); int dim_ffn = ffn1_weight_dim[0]; @@ -213,8 +214,8 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { &ffn1_dropout_mask, ffn1_dropout_mask.numel() * sizeof(uint8_t)); // 8. ffn2 matmul - auto ffn2_weights = ctx.MultiInput("FFN2Weight"); - auto ffn2_biases = ctx.MultiInput("FFN2Bias"); + auto ffn2_weights = ctx.MultiInput("FFN2Weight"); + auto ffn2_biases = ctx.MultiInput("FFN2Bias"); AttnMatmulINT8 ffn2_linear_compute( dev_ctx, bsz_seq, dim_embed, dim_ffn, false); @@ -245,7 +246,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { output_workspace.numel() * sizeof(int32_t)); // calc - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); Tensor *from_tensor = out; Tensor tmp_out; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index 86de140b9cde8..ede6300decbe5 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/fused/fused_multi_transformer_op.h" + #include #include @@ -21,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusedMultiTransformerOp : public framework::OperatorWithKernel { private: diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index 5cf22885aabba..b70f0c7ea1965 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -21,9 +21,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { using U = LayerNormParamType; auto &dev_ctx = ctx.cuda_device_context(); - auto *time_step = ctx.Input("TimeStep"); + auto *time_step = ctx.Input("TimeStep"); // 0. 
input - auto *input_x = ctx.Input("X"); + auto *input_x = ctx.Input("X"); const auto input_x_dims = input_x->dims(); int bsz = input_x_dims[0]; int seq_len = input_x_dims[1]; @@ -33,8 +33,8 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // 1. layer norm const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); const float epsilon = ctx.Attr("epsilon"); - auto ln_scales = ctx.MultiInput("LnScale"); - auto ln_biases = ctx.MultiInput("LnBias"); + auto ln_scales = ctx.MultiInput("LnScale"); + auto ln_biases = ctx.MultiInput("LnBias"); auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); Tensor ln_mean, ln_var; @@ -47,8 +47,8 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // 2. qkv // x: qkv's input [batch_size, seq_len, dim_embed] // y: qkv's weight: [3, num_head, dim_head, dim_embed] - auto qkv_weights = ctx.MultiInput("QKVW"); - auto qkv_biases = ctx.MultiInput("QKVBias"); + auto qkv_weights = ctx.MultiInput("QKVW"); + auto qkv_biases = ctx.MultiInput("QKVBias"); const bool trans_qkvw = ctx.Attr("trans_qkvw"); const auto qkv_w_dims = qkv_weights[0]->dims(); int num_head = trans_qkvw ? qkv_w_dims[1] : qkv_w_dims[2]; @@ -76,10 +76,10 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { true, "upscale_in_train", 0.0, true, true, 0, nullptr); auto fmha_compute = FMHARef(dev_ctx, bsz, seq_len, num_head, dim_head, attn_param); - auto *src_mask = ctx.Input("SrcMask"); - auto cache_kvs = ctx.MultiInput("CacheKV"); - auto cache_kv_outs = ctx.MultiOutput("CacheKVOut"); - // auto *time_step = ctx.Input("TimeStep"); + auto *src_mask = ctx.Input("SrcMask"); + auto cache_kvs = ctx.MultiInput("CacheKV"); + auto cache_kv_outs = ctx.MultiOutput("CacheKVOut"); + // auto *time_step = ctx.Input("TimeStep"); auto out_seq_len = seq_len; if (time_step) { @@ -132,8 +132,8 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { dev_ctx.Alloc(&fmha_out, fmha_out.numel() * sizeof(T)); // 4. out_linear - auto out_linear_weights = ctx.MultiInput("OutLinearW"); - auto out_linear_biases = ctx.MultiInput("OutLinearBias"); + auto out_linear_weights = ctx.MultiInput("OutLinearW"); + auto out_linear_biases = ctx.MultiInput("OutLinearBias"); int ring_id = ctx.Attr("ring_id"); // (transA, transB, compute_bias) = (false, false, false) auto out_linear_compute = AttnMatMul( @@ -143,8 +143,8 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { DropoutParam dropout_param2(true, 0, true, true, 0.0, nullptr, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); - auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); - auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); + auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); + auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); Tensor bias_dropout_residual_out, dropout_mask_out; T *bias_dropout_residual_out_data = nullptr; if (pre_layer_norm) { @@ -158,8 +158,8 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { &dropout_mask_out, dropout_mask_out.numel() * sizeof(uint8_t)); // 6. 
ffn matmul1 - auto ffn1_weights = ctx.MultiInput("FFN1Weight"); - auto ffn1_biases = ctx.MultiInput("FFN1Bias"); + auto ffn1_weights = ctx.MultiInput("FFN1Weight"); + auto ffn1_biases = ctx.MultiInput("FFN1Bias"); auto ffn1_weight_dim = ffn1_weights[0]->dims(); int dim_ffn = ffn1_weight_dim[1]; @@ -183,8 +183,8 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { &ffn1_dropout_mask, ffn1_dropout_mask.numel() * sizeof(uint8_t)); // 8. ffn2 matmul - auto ffn2_weights = ctx.MultiInput("FFN2Weight"); - auto ffn2_biases = ctx.MultiInput("FFN2Bias"); + auto ffn2_weights = ctx.MultiInput("FFN2Weight"); + auto ffn2_biases = ctx.MultiInput("FFN2Bias"); auto ffn2_linear_compute = AttnMatMul( dev_ctx, false, false, bsz_seq, dim_embed, dim_ffn, false); @@ -194,7 +194,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); // calc - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); Tensor *from_tensor = out; Tensor tmp_out; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.h index 761a31ce094d1..e0795616fd951 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.h @@ -41,13 +41,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; // for debug // #define _DEBUG_FUSED_MULTI_TRANSFORMER template -static void AllReduce(framework::Tensor &tensor, // NOLINT +static void AllReduce(phi::DenseTensor &tensor, // NOLINT const int ring_id, const int count, const phi::GPUContext &ctx) { diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index ba0652339e96e..79eb5f64cf0ec 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -50,8 +50,8 @@ struct FusedResidualDropoutBiasTester { bool has_bias = true; bool add_residual = true; - framework::Tensor src, residual, bias, out, mask; - framework::Tensor dsrc, dbias; + phi::DenseTensor src, residual, bias, out, mask; + phi::DenseTensor dsrc, dbias; std::vector src_vec, residual_vec, bias_vec; std::vector correct_out, correct_dsrc, correct_dbias; diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index dbfabe07f474f..a58a5ea01d02e 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -425,7 +425,7 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto inputs = ctx.MultiInput("X"); - auto outputs = ctx.MultiOutput("Out"); + auto outputs = ctx.MultiOutput("Out"); auto &dev_ctx = ctx.template device_context(); const auto slot_size = inputs.size(); std::vector input_data(slot_size); diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h index 009a9253ab351..60723c6cb5d17 100644 --- a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h +++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; - namespace plat = paddle::platform; #define FINAL_MASK 0xffffffff diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 194d171c46e7a..9eee08600ae0e 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -20,7 +20,7 @@ namespace paddle { namespace operators { #if CUDNN_VERSION >= 7100 -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; @@ -40,12 +40,12 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* input = ctx.Input("Input"); - auto filters = ctx.MultiInput("Filter"); - auto bias = ctx.MultiInput("Bias"); + auto* input = ctx.Input("Input"); + auto filters = ctx.MultiInput("Filter"); + auto bias = ctx.MultiInput("Bias"); - auto* output = ctx.Output("Output"); - auto temp_outs = ctx.MultiOutput("TempOutput"); + auto* output = ctx.Output("Output"); + auto temp_outs = ctx.MultiOutput("TempOutput"); const std::string pool_type = ctx.Attr("pooling_type"); const std::string activation = ctx.Attr("activation"); diff --git a/paddle/fluid/operators/fused/fusion_group_op_test.cc b/paddle/fluid/operators/fused/fusion_group_op_test.cc index e0ee074e7f2a7..6688501fe9ac5 100644 --- a/paddle/fluid/operators/fused/fusion_group_op_test.cc +++ b/paddle/fluid/operators/fused/fusion_group_op_test.cc @@ -26,10 +26,10 @@ namespace operators { using CPUKernelFunc = std::function args)>; template -framework::Tensor* CreateTensor(framework::Scope* scope, - const platform::Place& place, - const std::string& name, - const std::vector& shape) { +phi::DenseTensor* CreateTensor(framework::Scope* scope, + const platform::Place& place, + const std::string& name, + const std::vector& shape) { auto* var = scope->Var(name); auto* tensor = var->GetMutable(); if (shape.size() > 0) { @@ -39,7 +39,7 @@ framework::Tensor* CreateTensor(framework::Scope* scope, } template -void SetupRandomCPUTensor(framework::Tensor* tensor, +void SetupRandomCPUTensor(phi::DenseTensor* tensor, const std::vector& shape) { static unsigned int seed = 100; std::mt19937 rng(seed++); @@ -104,10 +104,10 @@ void PrepareDeviceCode(platform::Place place, void CheckOutputs(framework::Scope* scope, const std::vector& output_names, - std::vector* cpu_tensors, + std::vector* cpu_tensors, size_t num_inputs, CPUKernelFunc cpu_kernel_func) { - std::vector cpu_outputs; + std::vector cpu_outputs; cpu_outputs.resize(output_names.size()); for (size_t j = 0; j < output_names.size(); ++j) { auto* var = scope->Var(output_names[j]); @@ -158,11 +158,11 @@ void TestMain(const std::vector& input_names, framework::Scope scope; // Prepare input tensors. 
- std::vector cpu_tensors; + std::vector cpu_tensors; cpu_tensors.resize(input_names.size() + output_names.size()); for (size_t i = 0; i < input_names.size(); ++i) { SetupRandomCPUTensor(&(cpu_tensors[i]), input_shapes[i]); - framework::Tensor* dev_tensor = + phi::DenseTensor* dev_tensor = CreateTensor(&scope, place, input_names[i], input_shapes[i]); paddle::framework::TensorCopySync(cpu_tensors[i], place, dev_tensor); } diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index e2d2cf071caba..a8ad8c9cbf9ba 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -269,7 +269,7 @@ class FusionGRUKernel : public framework::OpKernel { #define INIT_BASE_DEFINES \ auto* x = ctx.Input("X"); \ - auto* wh = ctx.Input("WeightH"); \ + auto* wh = ctx.Input("WeightH"); \ auto* xx = ctx.Output("XX"); \ auto x_lod = x->lod(); \ auto x_dims = x->dims(); /* T x M*/ \ @@ -281,9 +281,9 @@ class FusionGRUKernel : public framework::OpKernel { const int D3 = wh_dims[1] #define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ auto* hidden_out = ctx.Output("Hidden"); \ bool is_reverse = ctx.Attr("is_reverse"); \ const int M = x_mat_dims[1]; \ @@ -408,7 +408,7 @@ class FusionGRUKernel : public framework::OpKernel { return; } INIT_OTHER_DEFINES; - auto* reordered_h0 = ctx.Output("ReorderedH0"); + auto* reordered_h0 = ctx.Output("ReorderedH0"); auto* batched_input = ctx.Output("BatchedInput"); auto* batched_out = ctx.Output("BatchedOut"); T* batched_input_data = batched_input->mutable_data(place); diff --git a/paddle/fluid/operators/fused/fusion_gru_op.h b/paddle/fluid/operators/fused/fusion_gru_op.h index eaa59cd412f8f..2e57998b71f59 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.h +++ b/paddle/fluid/operators/fused/fusion_gru_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusionGRUOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 5454c90b3c596..69561e0df1ffa 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -309,11 +309,11 @@ class FuisonLSTMKernel : public framework::OpKernel { #define INIT_BASE_DEFINES \ using DeviceContext = phi::CPUContext; \ auto* x = ctx.Input("X"); \ - auto* h0 = ctx.Input("H0"); \ - auto* c0 = ctx.Input("C0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* bias = ctx.Input("Bias"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ auto* xx = ctx.Output("XX"); \ auto* hidden_out = ctx.Output("Hidden"); \ auto* cell_out = ctx.Output("Cell"); \ @@ -336,7 +336,7 @@ class FuisonLSTMKernel : public framework::OpKernel { auto place = ctx.GetPlace(); \ if (use_peepholes) { \ /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ + auto* checked_cell = ctx.Output("CheckedCell"); \ checked_cell_data = checked_cell->mutable_data(place); \ } \ const jit::lstm_attr_t attr( \ @@ -448,8 
+448,8 @@ class FuisonLSTMKernel : public framework::OpKernel { } INIT_OTHER_DEFINES; - auto* reordered_h0 = ctx.Output("ReorderedH0"); - auto* reordered_c0 = ctx.Output("ReorderedC0"); + auto* reordered_h0 = ctx.Output("ReorderedH0"); + auto* reordered_c0 = ctx.Output("ReorderedC0"); auto* batched_input = ctx.Output("BatchedInput"); auto* batched_c_out = ctx.Output("BatchedCell"); auto* batched_h_out = ctx.Output("BatchedHidden"); diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.h b/paddle/fluid/operators/fused/fusion_lstm_op.h index 7f79601602348..2d64c592a2f6c 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.h +++ b/paddle/fluid/operators/fused/fusion_lstm_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusionLSTMOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index 983d51241491b..50291ee648141 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -140,11 +140,11 @@ template class FusionRepeatedFCReluKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto in = ctx.Input("X"); - auto weights = ctx.MultiInput("W"); - auto biases = ctx.MultiInput("Bias"); - auto relus = ctx.MultiOutput("ReluOut"); - auto* out = ctx.Output("Out"); + auto in = ctx.Input("X"); + auto weights = ctx.MultiInput("W"); + auto biases = ctx.MultiInput("Bias"); + auto relus = ctx.MultiOutput("ReluOut"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); int weight_sz = static_cast(weights.size()); diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h index cdcaf8b483346..383353180eb38 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusionRepeatedFCReluOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index 2ebac6d7f7124..64cc22224d385 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -151,10 +151,10 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using DeviceContext = phi::CPUContext; auto* x = ctx.Input("X"); - auto* w = ctx.Input("Filter"); - auto* b = ctx.Input("Bias"); + auto* w = ctx.Input("Filter"); + auto* b = ctx.Input("Bias"); auto* y = ctx.Output("Out"); - auto* col = ctx.Output("ColMat"); + auto* col = ctx.Output("ColMat"); auto x_lod = x->lod(); auto x_dims = x->dims(); diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h index 028d79dc2a1ee..7ce582c398604 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h @@ -19,7 +19,7 @@ namespace paddle 
{ namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusionSeqConvEltAddReluOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 6655c6756a5c8..095a1c1deb153 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -151,10 +151,10 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using DeviceContext = phi::CPUContext; auto ins = ctx.MultiInput("X"); - auto* w = ctx.Input("FCWeight"); - auto* b = ctx.Input("FCBias"); + auto* w = ctx.Input("FCWeight"); + auto* b = ctx.Input("FCBias"); auto* out = ctx.Output("Out"); - auto* fc_out = ctx.Output("FCOut"); + auto* fc_out = ctx.Output("FCOut"); auto* ref_in = ins[0]; auto ref_lod = ref_in->lod(); diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h index f78e820f60335..30170eb17d6da 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h index 9f882a59d351c..47204abb6d718 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusionSeqPoolConcatOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h index 75e8556c31a81..5e3afd4e62b08 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class FusionSeqPoolCVMConcatOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 6be6763492345..b7a01b7955887 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -88,12 +88,12 @@ template class FusionSquaredMatSubKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.Input("X"); - auto y = ctx.Input("Y"); - auto* squared_x = ctx.Output("SquaredX"); - auto* squared_y = ctx.Output("SquaredY"); - auto* squared_xy = ctx.Output("SquaredXY"); - auto* out = ctx.Output("Out"); + auto x = ctx.Input("X"); + auto y = ctx.Input("Y"); + auto* squared_x = ctx.Output("SquaredX"); + auto* squared_y = ctx.Output("SquaredY"); + 
auto* squared_xy = ctx.Output("SquaredXY"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); T scalar = static_cast(ctx.Attr("scalar")); diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h index 0ab2c2bb10a15..c926613dc29fa 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; // ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar class FusionSquaredMatSubOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc index c9900daf4ed06..e7bb037a3f3aa 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 4d063ba2be7cd..e5d32270bf4ee 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -28,8 +28,8 @@ template class TransposeFlattenConcatFusionKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); auto& dev_ctx = ctx.template device_context(); dev_ctx.Alloc(out, out->numel() * sizeof(T)); auto odims = out->dims(); diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index ff983684708aa..a040aa3779323 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -21,7 +21,7 @@ namespace paddle { namespace operators { using paddle::framework::LoDTensor; -using paddle::framework::Tensor; + using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; using phi::CPUContext; @@ -35,8 +35,8 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler { const dnnl::engine mkldnn_engine, platform::Place cpu_place, const LoDTensor* input, - const Tensor* weight_h, - const Tensor* h0, + const phi::DenseTensor* weight_h, + const phi::DenseTensor* h0, const bool is_reverse, const int64_t N, const int64_t Ti, @@ -116,8 +116,8 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler { } template - std::shared_ptr AcquireWeightXMemory(const Tensor* weight_x, - const bool origin_mode) { + std::shared_ptr AcquireWeightXMemory( + const phi::DenseTensor* weight_x, const bool origin_mode) { const std::string wx_key = this->memory_key_ + "@weight_x"; auto memory_p = std::static_pointer_cast(this->dev_ctx_.GetBlob(wx_key)); @@ -156,8 +156,8 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler { } template - std::shared_ptr AcquireWeightHMemory(const Tensor* weight_h, - const bool 
origin_mode) { + std::shared_ptr AcquireWeightHMemory( + const phi::DenseTensor* weight_h, const bool origin_mode) { const std::string wh_key = this->memory_key_ + "@weight_h"; auto memory_p = std::static_pointer_cast(this->dev_ctx_.GetBlob(wh_key)); @@ -209,7 +209,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler { return memory_p; } - std::shared_ptr AcquireBiasMemory(const Tensor* bias, + std::shared_ptr AcquireBiasMemory(const phi::DenseTensor* bias, const bool origin_mode) { const std::string bias_key = this->memory_key_ + "@bias"; auto memory_p = std::static_pointer_cast( @@ -263,10 +263,10 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { // Get Tensors const auto* input = ctx.Input("X"); - const auto* h0 = ctx.Input("H0"); - const auto* weight_x = ctx.Input("WeightX"); - const auto* weight_h = ctx.Input("WeightH"); - const auto* bias = ctx.Input("Bias"); + const auto* h0 = ctx.Input("H0"); + const auto* weight_x = ctx.Input("WeightX"); + const auto* weight_h = ctx.Input("WeightH"); + const auto* bias = ctx.Input("Bias"); auto* hidden = ctx.Output("Hidden"); auto x_dims = input->dims(); auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc index 748de5dae9520..6ecde2fdcf87e 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc @@ -21,7 +21,7 @@ namespace paddle { namespace operators { using paddle::framework::LoDTensor; -using paddle::framework::Tensor; + using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; using phi::CPUContext; @@ -36,9 +36,9 @@ class LSTMMKLDNNHandler const dnnl::engine mkldnn_engine, platform::Place cpu_place, const LoDTensor* input, - const Tensor* weight_h, - const Tensor* h0, - const Tensor* c0, + const phi::DenseTensor* weight_h, + const phi::DenseTensor* h0, + const phi::DenseTensor* c0, const bool is_reverse, const int64_t N, const int64_t Ti, @@ -168,7 +168,8 @@ class LSTMMKLDNNHandler } template - std::shared_ptr AcquireWeightXMemory(const Tensor* weight_x) { + std::shared_ptr AcquireWeightXMemory( + const phi::DenseTensor* weight_x) { const std::string wx_key = this->memory_key_ + "@weight_x"; auto memory_p = std::static_pointer_cast(this->dev_ctx_.GetBlob(wx_key)); @@ -199,7 +200,8 @@ class LSTMMKLDNNHandler } template - std::shared_ptr AcquireWeightHMemory(const Tensor* weight_h) { + std::shared_ptr AcquireWeightHMemory( + const phi::DenseTensor* weight_h) { const std::string wh_key = this->memory_key_ + "@weight_h"; auto memory_p = std::static_pointer_cast(this->dev_ctx_.GetBlob(wh_key)); @@ -229,7 +231,8 @@ class LSTMMKLDNNHandler return memory_p; } - std::shared_ptr AcquireBiasMemory(const Tensor* bias) { + std::shared_ptr AcquireBiasMemory( + const phi::DenseTensor* bias) { const std::string bias_key = this->memory_key_ + "@bias"; auto memory_p = std::static_pointer_cast( this->dev_ctx_.GetBlob(bias_key)); @@ -256,7 +259,8 @@ class LSTMMKLDNNHandler return memory_p; } - std::shared_ptr AcquirePeepholeWeights(const Tensor* bias) { + std::shared_ptr AcquirePeepholeWeights( + const phi::DenseTensor* bias) { const std::string peepholes_key = this->memory_key_ + "@peepholes_weights"; auto memory_p = std::static_pointer_cast( this->dev_ctx_.GetBlob(peepholes_key)); @@ -282,7 +286,7 @@ class LSTMMKLDNNHandler return memory_p; } - std::shared_ptr AcquireC0Memory(const Tensor* c0) { + 
std::shared_ptr AcquireC0Memory(const phi::DenseTensor* c0) { const std::string c0_key = this->memory_key_ + "@c0"; auto memory_p = std::static_pointer_cast(this->dev_ctx_.GetBlob(c0_key)); @@ -340,11 +344,11 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel { // Get Tensors const auto* input = ctx.Input("X"); - const auto* h0 = ctx.Input("H0"); - const auto* c0 = ctx.Input("C0"); - const auto* weight_x = ctx.Input("WeightX"); - const auto* weight_h = ctx.Input("WeightH"); - const auto* bias = ctx.Input("Bias"); + const auto* h0 = ctx.Input("H0"); + const auto* c0 = ctx.Input("C0"); + const auto* weight_x = ctx.Input("WeightX"); + const auto* weight_h = ctx.Input("WeightH"); + const auto* bias = ctx.Input("Bias"); auto* hidden = ctx.Output("Hidden"); auto* cell = ctx.Output("Cell"); cell = cell; diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h b/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h index a357a59a09420..f4ae023f85e43 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h +++ b/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { using paddle::framework::LoDTensor; -using paddle::framework::Tensor; + using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; @@ -35,8 +35,8 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT { const dnnl::engine mkldnn_engine, platform::Place cpu_place, const LoDTensor* input, - const Tensor* weight_h, - const Tensor* h0, + const phi::DenseTensor* weight_h, + const phi::DenseTensor* h0, const bool is_reverse, const int64_t N, const int64_t Ti, @@ -201,7 +201,7 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT { // TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does // not support in yet) template - std::shared_ptr AcquireH0Memory(const Tensor* h0) { + std::shared_ptr AcquireH0Memory(const phi::DenseTensor* h0) { const std::string h0_key = memory_key_ + "@h0"; auto memory_p = std::static_pointer_cast(this->dev_ctx_.GetBlob(h0_key)); diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index c59e7d661607c..372137511a2e6 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include -#include "dnnl.hpp" +#include "dnnl.hpp" // NOLINT #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fused/multi_gru_op.h" @@ -27,7 +27,7 @@ namespace paddle { namespace operators { using paddle::framework::LoDTensor; -using paddle::framework::Tensor; + using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; @@ -64,9 +64,9 @@ class MultiGRUHandler { layers_(ctx.Attr("layers")), concat_pds_(layers_, std::shared_ptr()), x_(ctx.Input("X")), - weights_x_(ctx.MultiInput("WeightX")), - weights_h_(ctx.MultiInput("WeightH")), - biases_(ctx.MultiInput("Bias")), + weights_x_(ctx.MultiInput("WeightX")), + weights_h_(ctx.MultiInput("WeightH")), + biases_(ctx.MultiInput("Bias")), hidden_(ctx.Output("Hidden")), x_lod_(x_->lod()[0]) { PADDLE_ENFORCE_EQ( @@ -672,9 +672,9 @@ class MultiGRUHandler { std::string memory_key_; const LoDTensor* x_; - const std::vector weights_x_; - const std::vector weights_h_; - const std::vector biases_; + const std::vector weights_x_; + const std::vector weights_h_; + const std::vector biases_; LoDTensor* hidden_; std::vector attrs_; const paddle::framework::Vector& x_lod_; diff --git a/paddle/fluid/operators/fused/multi_gru_op.h b/paddle/fluid/operators/fused/multi_gru_op.h index 8b064c8754f5e..ba239d20eb28f 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.h +++ b/paddle/fluid/operators/fused/multi_gru_op.h @@ -21,7 +21,6 @@ namespace operators { using framework::ExecutionContext; using framework::LoDTensor; -using framework::Tensor; class MultiGRUOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index c2e2754830bbd..a258c0107859c 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -260,11 +260,11 @@ template class MultiHeadMatMulV2Kernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - using Tensor = framework::Tensor; - auto *input = context.Input("Input"); - auto *w = context.Input("W"); - auto *bias = context.Input("Bias"); - auto *bias_qk = context.Input("BiasQK"); + using Tensor = phi::DenseTensor; + auto *input = context.Input("Input"); + auto *w = context.Input("W"); + auto *bias = context.Input("Bias"); + auto *bias_qk = context.Input("BiasQK"); auto *input_d = input->data(); auto *w_d = w->data(); @@ -310,7 +310,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { int all_head_size = w_dims[2]; int head_size = all_head_size / head_number; - auto *out = context.Output("Out"); + auto *out = context.Output("Out"); out->Resize({batch, seq_len, all_head_size}); auto *output_d = device_ctx.template Alloc(out, out->numel() * sizeof(T)); diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op.cc b/paddle/fluid/operators/fused/resnet_basic_block_op.cc index af5b76911692d..0f501368e73f0 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class ResNetBasicBlockOp : public framework::OperatorWithKernel { public: @@ -227,26 +227,26 @@ class ResNetBasicBlockOp : public framework::OperatorWithKernel { // By default, the type of the scale, bias, mean, // and var tensors should be float when input tensor's dtype is float16. auto bn_param_type = framework::proto::VarType::FP32; - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Scale1")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Bias1")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Scale2")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Bias2")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Scale1")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Bias1")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Scale2")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Bias2")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc index 429e644da4006..8310116849611 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc @@ -21,7 +21,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class ResnetBasicBlockAttr { public: @@ -49,11 +49,11 @@ class ResnetBasicBlockAttr { global_stats = test_mode || use_global_stats; // init shape - auto input1 = ctx.Input("X"); - auto filter1 = ctx.Input("Filter1"); - auto conv1_out = ctx.Output("Conv1"); - auto filter2 = ctx.Input("Filter2"); - auto conv2_out = ctx.Output("Conv2"); + auto input1 = ctx.Input("X"); + auto filter1 = ctx.Input("Filter1"); + auto conv1_out = ctx.Output("Conv1"); + auto filter2 = ctx.Input("Filter2"); + auto conv2_out = ctx.Output("Conv2"); conv1_input_shape = phi::vectorize(input1->dims()); conv1_output_shape = phi::vectorize(conv1_out->dims()); conv1_filter_shape = phi::vectorize(filter1->dims()); @@ -69,8 +69,8 @@ class ResnetBasicBlockAttr { conv2_output_numel = conv2_out->numel(); if (has_shortcut) { - auto filter3 = ctx.Input("Filter3"); - auto conv3_out = ctx.Output("Conv3"); + auto filter3 = ctx.Input("Filter3"); + auto conv3_out = ctx.Output("Conv3"); conv3_input_shape = phi::vectorize(input1->dims()); conv3_output_shape = phi::vectorize(conv3_out->dims()); conv3_filter_shape = 
phi::vectorize(filter3->dims()); @@ -137,11 +137,11 @@ class ResnetBasicBlockGradAttr { find_max = ctx.Attr("find_conv_input_max"); // init shape - auto input1 = ctx.Input("X"); - auto filter1 = ctx.Input("Filter1"); - auto conv1_out = ctx.Input("Conv1"); - auto filter2 = ctx.Input("Filter2"); - auto conv2_out = ctx.Input("Conv2"); + auto input1 = ctx.Input("X"); + auto filter1 = ctx.Input("Filter1"); + auto conv1_out = ctx.Input("Conv1"); + auto filter2 = ctx.Input("Filter2"); + auto conv2_out = ctx.Input("Conv2"); conv1_input_shape = phi::vectorize(input1->dims()); conv1_output_shape = phi::vectorize(conv1_out->dims()); conv1_filter_shape = phi::vectorize(filter1->dims()); @@ -157,8 +157,8 @@ class ResnetBasicBlockGradAttr { conv2_output_numel = conv2_out->numel(); if (has_shortcut) { - auto filter3 = ctx.Input("Filter3"); - auto conv3_out = ctx.Input("Conv3"); + auto filter3 = ctx.Input("Filter3"); + auto conv3_out = ctx.Input("Conv3"); conv3_input_shape = phi::vectorize(input1->dims()); conv3_output_shape = phi::vectorize(conv3_out->dims()); conv3_filter_shape = phi::vectorize(filter3->dims()); @@ -307,19 +307,19 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { platform::errors::PreconditionNotMet("It must use XPUPlace.")); // input - const Tensor* x = ctx.Input("X"); - const Tensor* filter1 = ctx.Input("Filter1"); - const Tensor* scale1 = ctx.Input("Scale1"); - const Tensor* bias1 = ctx.Input("Bias1"); - const Tensor* filter2 = ctx.Input("Filter2"); - const Tensor* scale2 = ctx.Input("Scale2"); - const Tensor* bias2 = ctx.Input("Bias2"); + const phi::DenseTensor* x = ctx.Input("X"); + const phi::DenseTensor* filter1 = ctx.Input("Filter1"); + const phi::DenseTensor* scale1 = ctx.Input("Scale1"); + const phi::DenseTensor* bias1 = ctx.Input("Bias1"); + const phi::DenseTensor* filter2 = ctx.Input("Filter2"); + const phi::DenseTensor* scale2 = ctx.Input("Scale2"); + const phi::DenseTensor* bias2 = ctx.Input("Bias2"); // output - Tensor* conv1_output = ctx.Output("Conv1"); - Tensor* conv2_output = ctx.Output("Conv2"); - Tensor* conv2_input = ctx.Output("Conv2Input"); - Tensor* output = ctx.Output("Y"); + phi::DenseTensor* conv1_output = ctx.Output("Conv1"); + phi::DenseTensor* conv2_output = ctx.Output("Conv2"); + phi::DenseTensor* conv2_input = ctx.Output("Conv2Input"); + phi::DenseTensor* output = ctx.Output("Y"); auto place = ctx.GetPlace(); auto x_data = reinterpret_cast(x->data()); @@ -348,19 +348,23 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { // init find max if (attr.find_max) { - Tensor* max_input1 = ctx.Output("MaxInput1"); - Tensor* max_filter1 = ctx.Output("MaxFilter1"); + phi::DenseTensor* max_input1 = ctx.Output("MaxInput1"); + phi::DenseTensor* max_filter1 = + ctx.Output("MaxFilter1"); conv1_input_max_data = max_input1->mutable_data(place); conv1_filter_max_data = max_filter1->mutable_data(place); - Tensor* max_input2 = ctx.Output("MaxInput2"); - Tensor* max_filter2 = ctx.Output("MaxFilter2"); + phi::DenseTensor* max_input2 = ctx.Output("MaxInput2"); + phi::DenseTensor* max_filter2 = + ctx.Output("MaxFilter2"); conv2_input_max_data = max_input2->mutable_data(place); conv2_filter_max_data = max_filter2->mutable_data(place); if (attr.has_shortcut) { - Tensor* max_input3 = ctx.Output("MaxInput3"); - Tensor* max_filter3 = ctx.Output("MaxFilter3"); + phi::DenseTensor* max_input3 = + ctx.Output("MaxInput3"); + phi::DenseTensor* max_filter3 = + ctx.Output("MaxFilter3"); conv3_input_max_data = max_input3->mutable_data(place); 
conv3_filter_max_data = max_filter3->mutable_data(place); } @@ -373,8 +377,8 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { // 1. short const XPUT* z_out_data = nullptr; if (attr.has_shortcut) { - Tensor* conv3_out = ctx.Output("Conv3"); - const Tensor* filter3 = ctx.Input("Filter3"); + phi::DenseTensor* conv3_out = ctx.Output("Conv3"); + const phi::DenseTensor* filter3 = ctx.Input("Filter3"); auto conv3_filter_data = reinterpret_cast(filter3->data()); auto conv3_output_data = @@ -414,8 +418,8 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { attr.group); // bn3 - const Tensor* scale3 = ctx.Input("Scale3"); - const Tensor* bias3 = ctx.Input("Bias3"); + const phi::DenseTensor* scale3 = ctx.Input("Scale3"); + const phi::DenseTensor* bias3 = ctx.Input("Bias3"); auto bias3_data = bias3->data(); auto scale3_data = scale3->data(); @@ -423,10 +427,14 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_XDNN_NOT_NULL(bn3_output_data); if (!attr.global_stats) { - Tensor* saved_mean3 = ctx.Output("SavedMean3"); - Tensor* saved_invstd3 = ctx.Output("SavedInvstd3"); - Tensor* running_mean3 = ctx.Output("Mean3Out"); - Tensor* running_var3 = ctx.Output("Var3Out"); + phi::DenseTensor* saved_mean3 = + ctx.Output("SavedMean3"); + phi::DenseTensor* saved_invstd3 = + ctx.Output("SavedInvstd3"); + phi::DenseTensor* running_mean3 = + ctx.Output("Mean3Out"); + phi::DenseTensor* running_var3 = + ctx.Output("Var3Out"); auto saved_mean3_data = saved_mean3->mutable_data(place); auto saved_invstd3_data = saved_invstd3->mutable_data(place); @@ -455,8 +463,8 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { 0); PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_fusion"); } else { - const auto* mean3 = ctx.Input("Mean3"); - const auto* var3 = ctx.Input("Var3"); + const auto* mean3 = ctx.Input("Mean3"); + const auto* var3 = ctx.Input("Var3"); const auto* mean3_data = mean3->data(); const auto* variance3_data = var3->data(); r = xpu::batch_norm_infer(dev_ctx.x_context(), @@ -513,10 +521,13 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { // 3. 
bn1 + relu if (!attr.global_stats) { - Tensor* saved_mean1 = ctx.Output("SavedMean1"); - Tensor* saved_invstd1 = ctx.Output("SavedInvstd1"); - Tensor* running_mean1 = ctx.Output("Mean1Out"); - Tensor* running_var1 = ctx.Output("Var1Out"); + phi::DenseTensor* saved_mean1 = + ctx.Output("SavedMean1"); + phi::DenseTensor* saved_invstd1 = + ctx.Output("SavedInvstd1"); + phi::DenseTensor* running_mean1 = + ctx.Output("Mean1Out"); + phi::DenseTensor* running_var1 = ctx.Output("Var1Out"); auto saved_mean1_data = saved_mean1->mutable_data(place); auto saved_invstd1_data = saved_invstd1->mutable_data(place); @@ -549,8 +560,8 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { auto bn1_output_data = RAII_GUARD.alloc(attr.conv1_output_numel); PADDLE_ENFORCE_XDNN_NOT_NULL(bn1_output_data); - const auto* mean1 = ctx.Input("Mean1"); - const auto* var1 = ctx.Input("Var1"); + const auto* mean1 = ctx.Input("Mean1"); + const auto* var1 = ctx.Input("Var1"); const auto* mean_data = mean1->data(); const auto* variance_data = var1->data(); r = xpu::batch_norm_infer(dev_ctx.x_context(), @@ -580,8 +591,9 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { XPUT* conv2_filter_l3_data = RAII_GUARD.alloc_l3(attr.conv2_filter_numel); if (attr.find_max) { - Tensor* max_input2 = ctx.Output("MaxInput2"); - Tensor* max_filter2 = ctx.Output("MaxFilter2"); + phi::DenseTensor* max_input2 = ctx.Output("MaxInput2"); + phi::DenseTensor* max_filter2 = + ctx.Output("MaxFilter2"); conv2_input_max_data = max_input2->mutable_data(place); conv2_filter_max_data = max_filter2->mutable_data(place); @@ -615,10 +627,13 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { // 5. bn2 if (!attr.global_stats) { - Tensor* saved_mean2 = ctx.Output("SavedMean2"); - Tensor* saved_var2 = ctx.Output("SavedInvstd2"); - Tensor* running_mean2 = ctx.Output("Mean2Out"); - Tensor* running_var2 = ctx.Output("Var2Out"); + phi::DenseTensor* saved_mean2 = + ctx.Output("SavedMean2"); + phi::DenseTensor* saved_var2 = + ctx.Output("SavedInvstd2"); + phi::DenseTensor* running_mean2 = + ctx.Output("Mean2Out"); + phi::DenseTensor* running_var2 = ctx.Output("Var2Out"); auto saved_mean2_data = saved_mean2->mutable_data(place); auto saved_var2_data = saved_var2->mutable_data(place); @@ -650,8 +665,8 @@ class ResNetBasicBlockXPUKernel : public framework::OpKernel { auto bn2_out_data = RAII_GUARD.alloc(attr.conv2_output_numel); PADDLE_ENFORCE_XDNN_NOT_NULL(bn2_out_data); - const auto* mean2 = ctx.Input("Mean2"); - const auto* var2 = ctx.Input("Var2"); + const auto* mean2 = ctx.Input("Mean2"); + const auto* var2 = ctx.Input("Var2"); const auto* mean_data = mean2->data(); const auto* variance_data = var2->data(); r = xpu::batch_norm_infer(dev_ctx.x_context(), @@ -694,48 +709,69 @@ class ResNetBasicBlockGradXPUKernel : public framework::OpKernel { true, platform::errors::PreconditionNotMet("It must use XPUPlace.")); - const Tensor* y_grad = ctx.Input(framework::GradVarName("Y")); - const Tensor* y = ctx.Input("Y"); - - const Tensor* x = ctx.Input("X"); - const Tensor* filter1 = ctx.Input("Filter1"); - const Tensor* scale1 = ctx.Input("Scale1"); - const Tensor* filter2 = ctx.Input("Filter2"); - const Tensor* scale2 = ctx.Input("Scale2"); - const Tensor* saved_mean1 = ctx.Input("SavedMean1"); - const Tensor* saved_invstd1 = ctx.Input("SavedInvstd1"); - const Tensor* saved_mean2 = ctx.Input("SavedMean2"); - const Tensor* saved_invstd2 = ctx.Input("SavedInvstd2"); - const Tensor* conv1_out = ctx.Input("Conv1"); - const Tensor* conv2_out 
= ctx.Input("Conv2"); - const Tensor* conv2_input = ctx.Input("Conv2Input"); - - const Tensor* filter3 = ctx.Input("Filter3"); - const Tensor* conv3_out = ctx.Input("Conv3"); - const Tensor* scale3 = ctx.Input("Scale3"); - const Tensor* saved_mean3 = ctx.Input("SavedMean3"); - const Tensor* saved_invstd3 = ctx.Input("SavedInvstd3"); - - const Tensor* conv1_input_max = ctx.Input("MaxInput1"); - const Tensor* conv1_filter_max = ctx.Input("MaxFilter1"); - const Tensor* conv2_input_max = ctx.Input("MaxInput2"); - const Tensor* conv2_filter_max = ctx.Input("MaxFilter2"); - const Tensor* conv3_input_max = ctx.Input("MaxInput3"); - const Tensor* conv3_filter_max = ctx.Input("MaxFilter3"); - - Tensor* x_grad = ctx.Output(framework::GradVarName("X")); - Tensor* filter1_grad = - ctx.Output(framework::GradVarName("Filter1")); - Tensor* scale1_grad = ctx.Output(framework::GradVarName("Scale1")); - Tensor* bias1_grad = ctx.Output(framework::GradVarName("Bias1")); - Tensor* filter2_grad = - ctx.Output(framework::GradVarName("Filter2")); - Tensor* scale2_grad = ctx.Output(framework::GradVarName("Scale2")); - Tensor* bias2_grad = ctx.Output(framework::GradVarName("Bias2")); - Tensor* filter3_grad = - ctx.Output(framework::GradVarName("Filter3")); - Tensor* scale3_grad = ctx.Output(framework::GradVarName("Scale3")); - Tensor* bias3_grad = ctx.Output(framework::GradVarName("Bias3")); + const phi::DenseTensor* y_grad = + ctx.Input(framework::GradVarName("Y")); + const phi::DenseTensor* y = ctx.Input("Y"); + + const phi::DenseTensor* x = ctx.Input("X"); + const phi::DenseTensor* filter1 = ctx.Input("Filter1"); + const phi::DenseTensor* scale1 = ctx.Input("Scale1"); + const phi::DenseTensor* filter2 = ctx.Input("Filter2"); + const phi::DenseTensor* scale2 = ctx.Input("Scale2"); + const phi::DenseTensor* saved_mean1 = + ctx.Input("SavedMean1"); + const phi::DenseTensor* saved_invstd1 = + ctx.Input("SavedInvstd1"); + const phi::DenseTensor* saved_mean2 = + ctx.Input("SavedMean2"); + const phi::DenseTensor* saved_invstd2 = + ctx.Input("SavedInvstd2"); + const phi::DenseTensor* conv1_out = ctx.Input("Conv1"); + const phi::DenseTensor* conv2_out = ctx.Input("Conv2"); + const phi::DenseTensor* conv2_input = + ctx.Input("Conv2Input"); + + const phi::DenseTensor* filter3 = ctx.Input("Filter3"); + const phi::DenseTensor* conv3_out = ctx.Input("Conv3"); + const phi::DenseTensor* scale3 = ctx.Input("Scale3"); + const phi::DenseTensor* saved_mean3 = + ctx.Input("SavedMean3"); + const phi::DenseTensor* saved_invstd3 = + ctx.Input("SavedInvstd3"); + + const phi::DenseTensor* conv1_input_max = + ctx.Input("MaxInput1"); + const phi::DenseTensor* conv1_filter_max = + ctx.Input("MaxFilter1"); + const phi::DenseTensor* conv2_input_max = + ctx.Input("MaxInput2"); + const phi::DenseTensor* conv2_filter_max = + ctx.Input("MaxFilter2"); + const phi::DenseTensor* conv3_input_max = + ctx.Input("MaxInput3"); + const phi::DenseTensor* conv3_filter_max = + ctx.Input("MaxFilter3"); + + phi::DenseTensor* x_grad = + ctx.Output(framework::GradVarName("X")); + phi::DenseTensor* filter1_grad = + ctx.Output(framework::GradVarName("Filter1")); + phi::DenseTensor* scale1_grad = + ctx.Output(framework::GradVarName("Scale1")); + phi::DenseTensor* bias1_grad = + ctx.Output(framework::GradVarName("Bias1")); + phi::DenseTensor* filter2_grad = + ctx.Output(framework::GradVarName("Filter2")); + phi::DenseTensor* scale2_grad = + ctx.Output(framework::GradVarName("Scale2")); + phi::DenseTensor* bias2_grad = + ctx.Output(framework::GradVarName("Bias2")); + 
phi::DenseTensor* filter3_grad = + ctx.Output(framework::GradVarName("Filter3")); + phi::DenseTensor* scale3_grad = + ctx.Output(framework::GradVarName("Scale3")); + phi::DenseTensor* bias3_grad = + ctx.Output(framework::GradVarName("Bias3")); // attrs ResnetBasicBlockGradAttr attr(ctx); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index 779e28c85b72a..61c8d9813ea29 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; // Shape of bitmask static framework::DDim GetBitmaskDims(std::vector out_shape) { @@ -209,16 +209,16 @@ class ResNetUnitOp : public framework::OperatorWithKernel { // and var tensors should be float when input tensor's dtype is float16. auto bn_param_type = framework::proto::VarType::FP32; - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("ScaleX")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("BiasX")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("ScaleX")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("BiasX")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; return framework::OpKernelType( diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu index d0a8788e0db2f..02bde0ef04ff2 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cu +++ b/paddle/fluid/operators/fused/resnet_unit_op.cu @@ -23,7 +23,7 @@ limitations under the License. 
*/
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;
 
 template <typename T>
 class ResNetUnitKernel : public framework::OpKernel<T> {
@@ -39,20 +39,20 @@ class ResNetUnitKernel : public framework::OpKernel<T> {
                           "ResNetUnitOp only supports float16 for now."));
 
     // input x
-    const Tensor *input_x = ctx.Input<Tensor>("X");
-    const Tensor *filter_x = ctx.Input<Tensor>("FilterX");
-    const Tensor *scale_x = ctx.Input<Tensor>("ScaleX");
-    const Tensor *bias_x = ctx.Input<Tensor>("BiasX");
+    const Tensor *input_x = ctx.Input<phi::DenseTensor>("X");
+    const Tensor *filter_x = ctx.Input<phi::DenseTensor>("FilterX");
+    const Tensor *scale_x = ctx.Input<phi::DenseTensor>("ScaleX");
+    const Tensor *bias_x = ctx.Input<phi::DenseTensor>("BiasX");
     // norm conv
-    Tensor *conv_out_x = ctx.Output<Tensor>("ConvX");
+    Tensor *conv_out_x = ctx.Output<phi::DenseTensor>("ConvX");
     // bn finalize
-    Tensor *saved_mean_x = ctx.Output<Tensor>("SavedMeanX");
-    Tensor *saved_invstd_x = ctx.Output<Tensor>("SavedInvstdX");
-    Tensor *running_mean_x = ctx.Output<Tensor>("RunningMeanX");
-    Tensor *running_var_x = ctx.Output<Tensor>("RunningVarX");
+    Tensor *saved_mean_x = ctx.Output<phi::DenseTensor>("SavedMeanX");
+    Tensor *saved_invstd_x = ctx.Output<phi::DenseTensor>("SavedInvstdX");
+    Tensor *running_mean_x = ctx.Output<phi::DenseTensor>("RunningMeanX");
+    Tensor *running_var_x = ctx.Output<phi::DenseTensor>("RunningVarX");
     // sbar
-    Tensor *output = ctx.Output<Tensor>("Y");
-    Tensor *bitmask = ctx.Output<Tensor>("BitMask");
+    Tensor *output = ctx.Output<phi::DenseTensor>("Y");
+    Tensor *bitmask = ctx.Output<phi::DenseTensor>("BitMask");
     // attrs
     int padding = ctx.Attr<int>("padding");
     int stride = ctx.Attr<int>("stride");
@@ -140,17 +140,17 @@ class ResNetUnitKernel : public framework::OpKernel<T> {
                            bitmask_shape);
     if (has_shortcut) {
       // input z
-      const Tensor *input_z = ctx.Input<Tensor>("Z");
-      const Tensor *filter_z = ctx.Input<Tensor>("FilterZ");
-      const Tensor *scale_z = ctx.Input<Tensor>("ScaleZ");
-      const Tensor *bias_z = ctx.Input<Tensor>("BiasZ");
+      const Tensor *input_z = ctx.Input<phi::DenseTensor>("Z");
+      const Tensor *filter_z = ctx.Input<phi::DenseTensor>("FilterZ");
+      const Tensor *scale_z = ctx.Input<phi::DenseTensor>("ScaleZ");
+      const Tensor *bias_z = ctx.Input<phi::DenseTensor>("BiasZ");
       // norm conv
-      Tensor *conv_out_z = ctx.Output<Tensor>("ConvZ");
+      Tensor *conv_out_z = ctx.Output<phi::DenseTensor>("ConvZ");
       // bn finalize
-      Tensor *saved_mean_z = ctx.Output<Tensor>("SavedMeanZ");
-      Tensor *saved_invstd_z = ctx.Output<Tensor>("SavedInvstdZ");
-      Tensor *running_mean_z = ctx.Output<Tensor>("RunningMeanZ");
-      Tensor *running_var_z = ctx.Output<Tensor>("RunningVarZ");
+      Tensor *saved_mean_z = ctx.Output<phi::DenseTensor>("SavedMeanZ");
+      Tensor *saved_invstd_z = ctx.Output<phi::DenseTensor>("SavedInvstdZ");
+      Tensor *running_mean_z = ctx.Output<phi::DenseTensor>("RunningMeanZ");
+      Tensor *running_var_z = ctx.Output<phi::DenseTensor>("RunningVarZ");
 
       auto input_z_shape = phi::vectorize<int>(input_z->dims());
       auto filter_z_shape = phi::vectorize<int>(filter_z->dims());
@@ -203,7 +203,8 @@ class ResNetUnitKernel : public framework::OpKernel<T> {
                      output,
                      bitmask);
     } else {
-      const Tensor *input_z = fuse_add ? ctx.Input<Tensor>("Z") : nullptr;
+      const Tensor *input_z =
+          fuse_add ?
ctx.Input("Z") : nullptr; sbar_op.Forward(dev_ctx, *conv_out_x, equiv_scale_x, @@ -230,24 +231,27 @@ class ResNetUnitGradKernel : public framework::OpKernel { platform::errors::Unavailable( "ResNetUnitOp only supports float16 for now.")); - const Tensor *y_grad = ctx.Input(framework::GradVarName("Y")); + const Tensor *y_grad = + ctx.Input(framework::GradVarName("Y")); - const Tensor *x = ctx.Input("X"); - const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *bias_x = ctx.Input("BiasX"); - const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); - const Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); + const Tensor *x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *bias_x = ctx.Input("BiasX"); + const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); + const Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); - const Tensor *conv_out_x = ctx.Input("ConvX"); - const Tensor *output = ctx.Input("Y"); - const Tensor *bitmask = ctx.Input("BitMask"); + const Tensor *conv_out_x = ctx.Input("ConvX"); + const Tensor *output = ctx.Input("Y"); + const Tensor *bitmask = ctx.Input("BitMask"); - Tensor *x_grad = ctx.Output(framework::GradVarName("X")); + Tensor *x_grad = ctx.Output(framework::GradVarName("X")); Tensor *filter_x_grad = - ctx.Output(framework::GradVarName("FilterX")); - Tensor *scale_x_grad = ctx.Output(framework::GradVarName("ScaleX")); - Tensor *bias_x_grad = ctx.Output(framework::GradVarName("BiasX")); + ctx.Output(framework::GradVarName("FilterX")); + Tensor *scale_x_grad = + ctx.Output(framework::GradVarName("ScaleX")); + Tensor *bias_x_grad = + ctx.Output(framework::GradVarName("BiasX")); int padding = ctx.Attr("padding"); int stride = ctx.Attr("stride"); @@ -291,20 +295,23 @@ class ResNetUnitGradKernel : public framework::OpKernel { // ScaleBiasAddRelu // | // Y - const Tensor *z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *bias_z = ctx.Input("BiasZ"); - const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); - const Tensor *saved_invstd_z = ctx.Input("SavedInvstdZ"); - const Tensor *conv_out_z = ctx.Input("ConvZ"); - - Tensor *z_grad = ctx.Output(framework::GradVarName("Z")); + const Tensor *z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *bias_z = ctx.Input("BiasZ"); + const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); + const Tensor *saved_invstd_z = + ctx.Input("SavedInvstdZ"); + const Tensor *conv_out_z = ctx.Input("ConvZ"); + + Tensor *z_grad = + ctx.Output(framework::GradVarName("Z")); Tensor *filter_z_grad = - ctx.Output(framework::GradVarName("FilterZ")); + ctx.Output(framework::GradVarName("FilterZ")); Tensor *scale_z_grad = - ctx.Output(framework::GradVarName("ScaleZ")); - Tensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); + ctx.Output(framework::GradVarName("ScaleZ")); + Tensor *bias_z_grad = + ctx.Output(framework::GradVarName("BiasZ")); // 1.1 Backward of BN + Add (+ Relu) for x, get conv_out_x_grad, // scale_x_grad, bias_x_grad and z_grad_temp @@ -360,7 +367,8 @@ class ResNetUnitGradKernel : public framework::OpKernel { // 1.1 Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, // scale_x_grad, bias_x_grad (and z_grad) Tensor *z_grad = - fuse_add ? ctx.Output(framework::GradVarName("Z")) : nullptr; + fuse_add ? 
ctx.Output(framework::GradVarName("Z")) + : nullptr; sbar_x_op.Backward(dev_ctx, *y_grad, *conv_out_x, diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc index e9ad179960628..80986761c7cba 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ResNetUnitXPUKernel : public framework::OpKernel { @@ -35,19 +35,19 @@ class ResNetUnitXPUKernel : public framework::OpKernel { bool is_nchw = (ctx.Attr("data_format") == "NCHW"); // input x - const Tensor *input_x = ctx.Input("X"); - const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *bias_x = ctx.Input("BiasX"); + const Tensor *input_x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *bias_x = ctx.Input("BiasX"); // output x - Tensor *conv_out_x = ctx.Output("ConvX"); - Tensor *saved_mean_x = ctx.Output("SavedMeanX"); - Tensor *saved_invstd_x = ctx.Output("SavedInvstdX"); - Tensor *running_mean_x = ctx.Output("RunningMeanX"); - Tensor *running_var_x = ctx.Output("RunningVarX"); + Tensor *conv_out_x = ctx.Output("ConvX"); + Tensor *saved_mean_x = ctx.Output("SavedMeanX"); + Tensor *saved_invstd_x = ctx.Output("SavedInvstdX"); + Tensor *running_mean_x = ctx.Output("RunningMeanX"); + Tensor *running_var_x = ctx.Output("RunningVarX"); - Tensor *output = ctx.Output("Y"); + Tensor *output = ctx.Output("Y"); // attrs int padding = ctx.Attr("padding"); @@ -101,16 +101,16 @@ class ResNetUnitXPUKernel : public framework::OpKernel { std::vector w_maxlist = {nullptr}; if (has_shortcut) { // input z - const Tensor *input_z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *bias_z = ctx.Input("BiasZ"); + const Tensor *input_z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *bias_z = ctx.Input("BiasZ"); - Tensor *conv_out_z = ctx.Output("ConvZ"); - Tensor *saved_mean_z = ctx.Output("SavedMeanZ"); - Tensor *saved_invstd_z = ctx.Output("SavedInvstdZ"); - Tensor *running_mean_z = ctx.Output("RunningMeanZ"); - Tensor *running_var_z = ctx.Output("RunningVarZ"); + Tensor *conv_out_z = ctx.Output("ConvZ"); + Tensor *saved_mean_z = ctx.Output("SavedMeanZ"); + Tensor *saved_invstd_z = ctx.Output("SavedInvstdZ"); + Tensor *running_mean_z = ctx.Output("RunningMeanZ"); + Tensor *running_var_z = ctx.Output("RunningVarZ"); x_list.push_back(reinterpret_cast(input_z->data())); w_list.push_back(reinterpret_cast(filter_z->data())); @@ -137,7 +137,7 @@ class ResNetUnitXPUKernel : public framework::OpKernel { w_maxlist.push_back(nullptr); } else { if (fuse_add) { - const Tensor *input_z = ctx.Input("Z"); + const Tensor *input_z = ctx.Input("Z"); auto input_z_shape = phi::vectorize(input_z->dims()); x_list.push_back(reinterpret_cast(input_z->data())); x_shape_list.push_back(input_z_shape); @@ -189,20 +189,23 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { platform::errors::PreconditionNotMet("It must use XPUPlace.")); bool is_nchw = (ctx.Attr("data_format") == "NCHW"); - const Tensor *y_grad = ctx.Input(framework::GradVarName("Y")); - const Tensor *x = ctx.Input("X"); - 
const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); - const Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); - const Tensor *conv_out_x = ctx.Input("ConvX"); - const Tensor *output = ctx.Input("Y"); - - Tensor *x_grad = ctx.Output(framework::GradVarName("X")); + const Tensor *y_grad = + ctx.Input(framework::GradVarName("Y")); + const Tensor *x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); + const Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); + const Tensor *conv_out_x = ctx.Input("ConvX"); + const Tensor *output = ctx.Input("Y"); + + Tensor *x_grad = ctx.Output(framework::GradVarName("X")); Tensor *filter_x_grad = - ctx.Output(framework::GradVarName("FilterX")); - Tensor *scale_x_grad = ctx.Output(framework::GradVarName("ScaleX")); - Tensor *bias_x_grad = ctx.Output(framework::GradVarName("BiasX")); + ctx.Output(framework::GradVarName("FilterX")); + Tensor *scale_x_grad = + ctx.Output(framework::GradVarName("ScaleX")); + Tensor *bias_x_grad = + ctx.Output(framework::GradVarName("BiasX")); int padding = ctx.Attr("padding"); int stride = ctx.Attr("stride"); @@ -262,19 +265,22 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { // ScaleBiasAddRelu // | // Y - const Tensor *z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); - const Tensor *saved_invstd_z = ctx.Input("SavedInvstdZ"); - const Tensor *conv_out_z = ctx.Input("ConvZ"); - - Tensor *z_grad = ctx.Output(framework::GradVarName("Z")); + const Tensor *z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); + const Tensor *saved_invstd_z = + ctx.Input("SavedInvstdZ"); + const Tensor *conv_out_z = ctx.Input("ConvZ"); + + Tensor *z_grad = + ctx.Output(framework::GradVarName("Z")); Tensor *filter_z_grad = - ctx.Output(framework::GradVarName("FilterZ")); + ctx.Output(framework::GradVarName("FilterZ")); Tensor *scale_z_grad = - ctx.Output(framework::GradVarName("ScaleZ")); - Tensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); + ctx.Output(framework::GradVarName("ScaleZ")); + Tensor *bias_z_grad = + ctx.Output(framework::GradVarName("BiasZ")); x_list.push_back(reinterpret_cast(z->data())); w_list.push_back(reinterpret_cast(filter_z->data())); conv_y_list.push_back( @@ -303,7 +309,7 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { dbias_list.push_back(bias_z_grad->mutable_data(place)); } else { if (fuse_add) { - auto z_grad = ctx.Output(framework::GradVarName("Z")); + auto z_grad = ctx.Output(framework::GradVarName("Z")); dx_list.push_back( reinterpret_cast(z_grad->mutable_data(place))); } diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu index 307d61b31ad38..96646071567d5 100644 --- a/paddle/fluid/operators/fused/skip_layernorm_op.cu +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu @@ -29,11 +29,11 @@ template class SkipLayerNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - using Tensor = framework::Tensor; - auto *X = context.Input("X"); - auto *Y = context.Input("Y"); - auto *scale = 
context.Input("Scale"); - auto *bias = context.Input("Bias"); + using Tensor = phi::DenseTensor; + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *scale = context.Input("Scale"); + auto *bias = context.Input("Bias"); auto *X_d = X->data(); auto *Y_d = Y->data(); @@ -42,7 +42,7 @@ class SkipLayerNormKernel : public framework::OpKernel { float epsilon = context.Attr("epsilon"); int begin_norm_axis = context.Attr("begin_norm_axis"); - auto *out = context.Output("Out"); + auto *out = context.Output("Out"); out->Resize(X->dims()); auto &dev_ctx = context.template device_context(); auto *output_d = dev_ctx.Alloc(out, out->numel() * sizeof(T)); diff --git a/paddle/fluid/operators/fused/yolo_box_head_op.cu b/paddle/fluid/operators/fused/yolo_box_head_op.cu index f932b13d993fa..696cab20db714 100644 --- a/paddle/fluid/operators/fused/yolo_box_head_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_head_op.cu @@ -67,9 +67,9 @@ template class YoloBoxHeadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using Tensor = framework::Tensor; - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + using Tensor = phi::DenseTensor; + auto* x = context.Input("X"); + auto* out = context.Output("Out"); auto anchors = context.Attr>("anchors"); auto class_num = context.Attr("class_num"); auto& device_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu index 4d53cccf97685..072f0374c5b82 100644 --- a/paddle/fluid/operators/fused/yolo_box_post_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -319,13 +319,13 @@ template class YoloBoxPostKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; // prepare inputs std::vector boxes_input(3); std::vector> boxes_input_dims(3); for (int i = 0; i < 3; i++) { auto* boxes_tensor = - context.Input("Boxes" + std::to_string(i)); + context.Input("Boxes" + std::to_string(i)); boxes_input[i] = boxes_tensor->data(); auto dims = boxes_tensor->dims(); for (int j = 0; j < dims.size(); j++) { @@ -333,13 +333,13 @@ class YoloBoxPostKernel : public framework::OpKernel { } } const float* image_shape_data = - context.Input("ImageShape")->data(); + context.Input("ImageShape")->data(); const float* image_scale_data = - context.Input("ImageScale")->data(); + context.Input("ImageScale")->data(); // prepare outputs - auto* boxes_scores_tensor = context.Output("Out"); - auto* boxes_num_tensor = context.Output("NmsRoisNum"); + auto* boxes_scores_tensor = context.Output("Out"); + auto* boxes_num_tensor = context.Output("NmsRoisNum"); // prepare anchors std::vector anchors; @@ -382,7 +382,7 @@ class YoloBoxPostKernel : public framework::OpKernel { // clip_bbox and scale_x_y is not used now! 
     float nms_threshold = context.Attr<float>("nms_threshold");
-    int batch = context.Input<Tensor>("ImageShape")->dims()[0];
+    int batch = context.Input<phi::DenseTensor>("ImageShape")->dims()[0];
     TensorInfo* ts_info = new TensorInfo[batch * boxes_input.size()];
     for (int i = 0; i < batch * static_cast<int>(boxes_input.size()); i++) {
 #ifdef PADDLE_WITH_HIP
diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cc b/paddle/fluid/operators/fused_softmax_mask_op.cc
index 604eaaaf3fc7c..3c8db22f52617 100644
--- a/paddle/fluid/operators/fused_softmax_mask_op.cc
+++ b/paddle/fluid/operators/fused_softmax_mask_op.cc
@@ -22,8 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::Tensor;
-
 class SoftmaxMaskFuseOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc
index 5992fa2dfc6e4..5d1e4089a753d 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::Tensor;
-
 class SoftmaxMaskFuseUpperTriangleOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
index 54db576d3171b..4a59250847444 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
@@ -51,7 +51,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::Tensor;
 
 #ifdef PADDLE_WITH_HIP
 #define WARP_SIZE 64
@@ -348,8 +347,8 @@ template
 class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* y = context.Output<Tensor>("Out");
+    auto* x = context.Input<phi::DenseTensor>("X");
+    auto* y = context.Output<phi::DenseTensor>("Out");
 
     auto* x_data = x->data<T>();
     auto* y_data = y->mutable_data<T>(context.GetPlace());
@@ -458,9 +457,11 @@ template
 class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* softmax_rst = context.Input<Tensor>("Softmax");
+    auto* grad_x =
+        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
+    auto* grad_y =
+        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
+    auto* softmax_rst = context.Input<phi::DenseTensor>("Softmax");
 
     auto* grad_x_data = grad_x->mutable_data<T>(context.GetPlace());
     auto* grad_y_data = grad_y->data<T>();
diff --git a/paddle/fluid/operators/fused_token_prune_op.cc b/paddle/fluid/operators/fused_token_prune_op.cc
index da43ab7588647..2fb5435bdcbe6 100644
--- a/paddle/fluid/operators/fused_token_prune_op.cc
+++ b/paddle/fluid/operators/fused_token_prune_op.cc
@@ -15,8 +15,6 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using framework::Tensor; - class FusedTokenPruneOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/fused_token_prune_op.cu b/paddle/fluid/operators/fused_token_prune_op.cu index 90044f30d8a6e..acf589ef186eb 100644 --- a/paddle/fluid/operators/fused_token_prune_op.cu +++ b/paddle/fluid/operators/fused_token_prune_op.cu @@ -28,8 +28,6 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using framework::Tensor; - template struct AttnMaskFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { @@ -87,10 +85,11 @@ class FusedTokenPruneOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto& dev_ctx = context.cuda_device_context(); // Inouts - const Tensor* attn = context.Input("Attn"); - const Tensor* x = context.Input("X"); - const Tensor* mask = context.Input("Mask"); - const Tensor* new_mask = context.Input("NewMask"); + const phi::DenseTensor* attn = context.Input("Attn"); + const phi::DenseTensor* x = context.Input("X"); + const phi::DenseTensor* mask = context.Input("Mask"); + const phi::DenseTensor* new_mask = + context.Input("NewMask"); // Input dims auto attn_dims = attn->dims(); @@ -108,35 +107,37 @@ class FusedTokenPruneOpCUDAKernel : public framework::OpKernel { const bool keep_order = context.Attr("keep_order"); // Outputs - Tensor* out_slimmed_x = context.Output("SlimmedX"); - Tensor* slimmed_indices = context.Output("CLSInds"); + phi::DenseTensor* out_slimmed_x = + context.Output("SlimmedX"); + phi::DenseTensor* slimmed_indices = + context.Output("CLSInds"); auto* out_slimmed_x_data = out_slimmed_x->mutable_data(context.GetPlace()); auto* slimmed_indices_data = slimmed_indices->mutable_data(context.GetPlace()); // Intermediate variable - Tensor attn_tmp; + phi::DenseTensor attn_tmp; auto* attn_tmp_data = attn_tmp.mutable_data(attn_dims, context.GetPlace()); - Tensor attn_accu; + phi::DenseTensor attn_accu; auto* attn_accu_data = attn_accu.mutable_data({bsz, max_seq_len}, context.GetPlace()); - Tensor attn_accu_indices; + phi::DenseTensor attn_accu_indices; auto* attn_accu_indices_data = attn_accu_indices.mutable_data( {bsz, max_seq_len}, context.GetPlace()); - Tensor sort_attn_accu; + phi::DenseTensor sort_attn_accu; auto* sort_attn_accu_data = sort_attn_accu.mutable_data({bsz, max_seq_len}, context.GetPlace()); - Tensor sort_attn_accu_indices; + phi::DenseTensor sort_attn_accu_indices; auto* sort_attn_accu_indices_data = sort_attn_accu_indices.mutable_data({bsz, max_seq_len}, context.GetPlace()); - Tensor temp_storage; + phi::DenseTensor temp_storage; // 1. 
Filter attn by mask - std::vector ins; - std::vector outs; + std::vector ins; + std::vector outs; ins.emplace_back(attn); ins.emplace_back(mask); outs.emplace_back(&attn_tmp); diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index 59648bc7d17eb..3198e35b8a438 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -27,7 +27,7 @@ class GatherNdOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); const auto& x_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); return framework::OpKernelType( x_type, diff --git a/paddle/fluid/operators/gather_nd_op_mlu.cc b/paddle/fluid/operators/gather_nd_op_mlu.cc index aa869f8fa1534..b6c96e3c2edd5 100644 --- a/paddle/fluid/operators/gather_nd_op_mlu.cc +++ b/paddle/fluid/operators/gather_nd_op_mlu.cc @@ -20,15 +20,15 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class GatherNdMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->template mutable_data(place); @@ -71,10 +71,10 @@ template class GatherNdGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *x = ctx.Input("X"); if (dx->numel() == 0) return; if (index->numel() == 0) { @@ -83,8 +83,8 @@ class GatherNdGradMLUKernel : public framework::OpKernel { return; } - framework::Tensor tmp_tensor(index->type()); - framework::Tensor tmp_tensor2(dout->type()); + phi::DenseTensor tmp_tensor(index->type()); + phi::DenseTensor tmp_tensor2(dout->type()); const auto index_dims = index->dims(); if (index_dims.size() == 1) { tmp_tensor.ShareDataWith(*index); diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc index 3e91360fd054a..5cea840b4aec5 100644 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ b/paddle/fluid/operators/gather_nd_op_npu.cc @@ -21,16 +21,16 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template class GatherNdNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *out = ctx.Output("Out"); out->template mutable_data(ctx.GetPlace()); @@ -65,10 +65,10 @@ template class GatherNdGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *x = ctx.Input("X"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); + auto *index = ctx.Input("Index"); + auto *x = ctx.Input("X"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); auto *p = dx->mutable_data(ctx.GetPlace()); if (dx->numel() == 0) return; @@ -78,8 +78,8 @@ class GatherNdGradNPUKernel : public framework::OpKernel { return; } - framework::Tensor tmp_tensor(index->type()); - framework::Tensor tmp_tensor2(dout->type()); + phi::DenseTensor tmp_tensor(index->type()); + phi::DenseTensor tmp_tensor2(dout->type()); const auto index_dims = index->dims(); if (index_dims.size() == 1) { tmp_tensor.ShareDataWith(*index); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 77e4adfeea787..4907153a11874 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "Axis") { return expected_kernel_type; @@ -63,7 +63,7 @@ class GatherGradOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "Axis") { return expected_kernel_type; diff --git a/paddle/fluid/operators/gather_op_mlu.cc b/paddle/fluid/operators/gather_op_mlu.cc index 5162e5838d013..20a108c981d7e 100644 --- a/paddle/fluid/operators/gather_op_mlu.cc +++ b/paddle/fluid/operators/gather_op_mlu.cc @@ -23,8 +23,8 @@ template class GatherOpMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); auto axis = ctx.Attr("axis"); const auto index_dims = index->dims(); @@ -44,7 +44,7 @@ class GatherOpMLUKernel : public framework::OpKernel { index_dims.size())); } - auto *out = ctx.Output("Out"); + auto *out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc x_desc(*x); @@ -68,9 +68,9 @@ template class GatherGradOpMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); + auto *index 
= ctx.Input("Index"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); const auto index_dims = index->dims(); if (index_dims.size() == 2) { diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index 8f470b0f664e5..b6c1e3ddc6d21 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -28,9 +28,9 @@ template class GatherOpNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); const auto &runner = NpuOpRunner( @@ -46,14 +46,14 @@ template class GatherGradOpNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *x = ctx.Input("X"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); + auto *index = ctx.Input("Index"); + auto *x = ctx.Input("X"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); // step1: Unsqueeze index - framework::Tensor tmp_tensor(index->type()); + phi::DenseTensor tmp_tensor(index->type()); const auto index_dims = index->dims(); if (index_dims.size() == 1) { tmp_tensor.ShareDataWith(*index); diff --git a/paddle/fluid/operators/gather_scatter_kernel.cc b/paddle/fluid/operators/gather_scatter_kernel.cc index 716e103990e6f..e05a214dcb4c1 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cc +++ b/paddle/fluid/operators/gather_scatter_kernel.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class TensorAssign { public: @@ -52,8 +52,8 @@ struct cpu_gather_scatter_functor { template void operator()(Tensor self, int dim, - const Tensor& index, - const Tensor& src, + const phi::DenseTensor& index, + const phi::DenseTensor& src, const std::string& method_name, const func_t& reduce_op, const platform::DeviceContext& ctx) { @@ -120,8 +120,8 @@ struct cpu_gather_scatter_functor { self_idx = is_scatter_like ? replace_index : index_idx; src_idx = is_scatter_like ? 
index_idx : replace_index; - reduce_op((tensor_t*)(self_data + self_idx), - (tensor_t*)(src_data + src_idx)); + reduce_op(static_cast(self_data + self_idx), + static_cast(src_data + src_idx)); index_idx++; } } @@ -132,7 +132,7 @@ struct cpu_gather_scatter_functor { template void cpu_gather_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor result, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor void cpu_scatter_assign_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor void cpu_scatter_add_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor void cpu_scatter_mul_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor void cpu_scatter_input_grad_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor output, const platform::DeviceContext& ctx) { auto* index_data = index.data(); diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index fa28481f4c4b6..80dbce4b24d28 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class TensorAssign { public: @@ -98,7 +98,8 @@ __global__ void GatherScatterGPUKernel(tensor_t* self_data, i * outer_dim_size * replaced_select_dim_size; int64_t self_idx = is_scatter_like ? replace_index : tid; int64_t src_idx = is_scatter_like ? 
tid : replace_index; - reduce_op((tensor_t*)(self_data + self_idx), (tensor_t*)(src_data + src_idx)); + reduce_op(static_cast(self_data + self_idx), + static_cast(src_data + src_idx)); } template void operator()(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const std::string& method_name, const func_t& reduce_op, @@ -161,7 +162,7 @@ struct gpu_gather_scatter_functor { template void gpu_gather_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor result, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor void gpu_scatter_assign_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor void gpu_scatter_add_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor void gpu_scatter_mul_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor void gpu_scatter_input_grad_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor grad, const platform::DeviceContext& ctx) { auto* index_data = index.data(); diff --git a/paddle/fluid/operators/gather_scatter_kernel.h b/paddle/fluid/operators/gather_scatter_kernel.h index 6aa6e4ff7b858..b97451b488b92 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.h +++ b/paddle/fluid/operators/gather_scatter_kernel.h @@ -32,84 +32,84 @@ namespace operators { #define Instantiate_Template_Function_index_t(func, tensor_t) \ template void func(Tensor input, \ int dim, \ - const Tensor& index, \ + const phi::DenseTensor& index, \ Tensor result, \ const platform::DeviceContext& ctx); \ template void func(Tensor input, \ int dim, \ - const Tensor& index, \ + const phi::DenseTensor& index, \ Tensor result, \ const platform::DeviceContext& ctx); -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template void cpu_gather_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor result, const platform::DeviceContext& ctx); template void cpu_scatter_assign_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx); template void cpu_scatter_add_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx); template void cpu_scatter_mul_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx); template void cpu_scatter_input_grad_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor result, const platform::DeviceContext& ctx); template void gpu_gather_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor result, const platform::DeviceContext& ctx); template void gpu_scatter_assign_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx); template void gpu_scatter_add_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor src, const platform::DeviceContext& ctx); template void gpu_scatter_mul_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor 
src, const platform::DeviceContext& ctx); template void gpu_scatter_input_grad_kernel(Tensor self, int dim, - const Tensor& index, + const phi::DenseTensor& index, Tensor result, const platform::DeviceContext& ctx); } // namespace operators diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 11c46d1772957..ff48ab776a856 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -20,9 +20,9 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" TEST(Gather, GatherData) { - paddle::framework::Tensor* src = new paddle::framework::Tensor(); - paddle::framework::Tensor* index = new paddle::framework::Tensor(); - paddle::framework::Tensor* output = new paddle::framework::Tensor(); + phi::DenseTensor* src = new phi::DenseTensor(); + phi::DenseTensor* index = new phi::DenseTensor(); + phi::DenseTensor* output = new phi::DenseTensor(); int* p_src = nullptr; int* p_index = nullptr; diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index b80bc7320c1fd..e2ee27f2561e1 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { @@ -34,7 +34,7 @@ class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { float mean = context.Attr("mean"); float std = context.Attr("std"); - auto* tensor = context.Output("Out"); + auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); @@ -75,7 +75,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") { return expected_kernel_type; diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 4df716f79f2af..41d2547cc9ba0 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -51,7 +51,7 @@ template class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); + auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); T mean = static_cast(context.Attr("mean")); diff --git a/paddle/fluid/operators/gaussian_random_op_mlu.cc b/paddle/fluid/operators/gaussian_random_op_mlu.cc index 4b5229b9e63ea..a70ddc428d840 100644 --- a/paddle/fluid/operators/gaussian_random_op_mlu.cc +++ b/paddle/fluid/operators/gaussian_random_op_mlu.cc @@ -20,14 +20,14 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class MLUGaussianRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { float mean = context.Attr("mean"); float std = context.Attr("std"); - auto* tensor = context.Output("Out"); + auto* tensor = context.Output("Out"); tensor->mutable_data(context.GetPlace()); Tensor cpu_tensor(tensor->type()); diff --git a/paddle/fluid/operators/gaussian_random_op_npu.cc b/paddle/fluid/operators/gaussian_random_op_npu.cc index 8b3af57d923fe..0768f4be5c957 100644 --- a/paddle/fluid/operators/gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/gaussian_random_op_npu.cc @@ -25,14 +25,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class NPUGaussianRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { float mean = context.Attr("mean"); float std = context.Attr("std"); - auto* tensor = context.Output("Out"); + auto* tensor = context.Output("Out"); tensor->mutable_data(context.GetPlace()); Tensor cpu_tensor(tensor->dtype()); diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc index 15e16420582c3..f462336b412a3 100644 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ b/paddle/fluid/operators/gelu_op_npu.cc @@ -23,15 +23,15 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class GeluNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -50,10 +50,10 @@ template class GeluGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index fc8f195fb70a8..c83419f309237 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -49,7 +49,7 @@ constexpr int WARP_SIZE = 32; namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template struct MaxFunctor { @@ -420,9 +420,9 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // 1. Get sample neighbors operators' inputs. 
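Most hunks in this section apply one mechanical substitution to operator kernels: inputs and outputs are fetched as phi::DenseTensor instead of the old framework::Tensor alias. The explicit template argument is implied by the + lines; the sketch below is an assumed illustration of the resulting shape, not a literal hunk from this patch (ToyKernel, its "X"/"Out" slots, and the include are hypothetical):

#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

// Hypothetical kernel showing the post-patch accessor pattern.
template <typename T>
class ToyKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Previously written as Input<Tensor>/Output<Tensor> (or framework::Tensor).
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->Resize(x->dims());                 // shape the output like the input
    out->mutable_data<T>(ctx.GetPlace());   // allocate on the kernel's place
    // ... device-specific computation on x / out ...
  }
};

}  // namespace operators
}  // namespace paddle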
- auto* src = ctx.Input("Row"); - auto* dst_count = ctx.Input("Col_Ptr"); - auto* vertices = ctx.Input("X"); + auto* src = ctx.Input("Row"); + auto* dst_count = ctx.Input("Col_Ptr"); + auto* vertices = ctx.Input("X"); std::vector sample_sizes = ctx.Attr>("sample_sizes"); bool return_eids = ctx.Attr("return_eids"); @@ -451,7 +451,7 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { bool is_last_layer = false, is_first_layer = true; if (return_eids) { - auto* src_eids = ctx.Input("Eids"); + auto* src_eids = ctx.Input("Eids"); const T* src_eids_data = src_eids->data(); for (int i = 0; i < num_layers; i++) { if (i == num_layers - 1) { @@ -563,7 +563,7 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { eids_merge_ptr); } } - auto* out_eids = ctx.Output("Out_Eids"); + auto* out_eids = ctx.Output("Out_Eids"); out_eids->Resize({static_cast(eids_merge.size())}); T* p_out_eids = out_eids->mutable_data(ctx.GetPlace()); thrust::copy(eids_merge.begin(), eids_merge.end(), p_out_eids); @@ -592,11 +592,11 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { &orig_nodes, &reindex_nodes, bs); - auto* reindex_x = ctx.Output("Reindex_X"); + auto* reindex_x = ctx.Output("Reindex_X"); T* p_reindex_x = reindex_x->mutable_data(ctx.GetPlace()); thrust::copy(reindex_nodes.begin(), reindex_nodes.end(), p_reindex_x); - auto* sample_index = ctx.Output("Sample_Index"); + auto* sample_index = ctx.Output("Sample_Index"); sample_index->Resize({static_cast(subset.size())}); T* p_sample_index = sample_index->mutable_data(ctx.GetPlace()); thrust::copy(subset.begin(), subset.end(), p_sample_index); // Done! @@ -628,8 +628,8 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { thrust::raw_pointer_cast(dst_merge.data())); // 8. Give operator's outputs. - auto* out_src = ctx.Output("Out_Src"); - auto* out_dst = ctx.Output("Out_Dst"); + auto* out_src = ctx.Output("Out_Src"); + auto* out_dst = ctx.Output("Out_Dst"); out_src->Resize({static_cast(src_merge.size()), 1}); out_dst->Resize({static_cast(src_merge.size()), 1}); T* p_out_src = out_src->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/graph_khop_sampler_op.h b/paddle/fluid/operators/graph_khop_sampler_op.h index 1b08acbbedd23..278bbd5efd723 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.h +++ b/paddle/fluid/operators/graph_khop_sampler_op.h @@ -28,7 +28,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template void SampleUniqueNeighbors(bidiiter begin, bidiiter end, int num_samples) { @@ -198,9 +198,9 @@ class GraphKhopSamplerOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // 1. Get sample neighbors operators' inputs. 
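Where a hunk only rewrites the file-local alias (-using Tensor = framework::Tensor; / +using Tensor = phi::DenseTensor;), the change is behaviour-preserving, since framework::Tensor is at this point itself an alias of phi::DenseTensor; keeping the alias lets untouched local uses such as Tensor cpu_tensor(tensor->dtype()) compile unchanged while explicit call sites migrate. A minimal sketch under that assumption (MakeTemp and the exact include are illustrative only):

#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

// The alias now points at phi::DenseTensor directly; nothing else changes,
// because framework::Tensor already named the same type.
using Tensor = phi::DenseTensor;

// Illustrative helper: local temporaries may keep using the short alias.
inline void MakeTemp(const phi::DenseTensor& src) {
  Tensor tmp(src.dtype());   // dtype-only constructor, as used elsewhere in this patch
  tmp.Resize(src.dims());    // phi::DenseTensor API is available directly
}

}  // namespace operators
}  // namespace paddle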
- auto* src = ctx.Input("Row"); - auto* dst_count = ctx.Input("Col_Ptr"); - auto* vertices = ctx.Input("X"); + auto* src = ctx.Input("Row"); + auto* dst_count = ctx.Input("Col_Ptr"); + auto* vertices = ctx.Input("X"); std::vector sample_sizes = ctx.Attr>("sample_sizes"); bool return_eids = ctx.Attr("return_eids"); @@ -229,7 +229,7 @@ class GraphKhopSamplerOpKernel : public framework::OpKernel { bool is_last_layer = false, is_first_layer = true; if (return_eids) { - auto* src_eids = ctx.Input("Eids"); + auto* src_eids = ctx.Input("Eids"); const T* src_eids_data = src_eids->data(); for (size_t i = 0; i < num_layers; i++) { if (i == num_layers - 1) { @@ -336,7 +336,7 @@ class GraphKhopSamplerOpKernel : public framework::OpKernel { eids_merge_ptr); } } - auto* out_eids = ctx.Output("Out_Eids"); + auto* out_eids = ctx.Output("Out_Eids"); out_eids->Resize({static_cast(eids_merge.size())}); T* p_out_eids = out_eids->mutable_data(ctx.GetPlace()); std::copy(eids_merge.begin(), eids_merge.end(), p_out_eids); @@ -377,16 +377,16 @@ class GraphKhopSamplerOpKernel : public framework::OpKernel { } // 7. Get Reindex_X for input nodes. - auto* reindex_x = ctx.Output("Reindex_X"); + auto* reindex_x = ctx.Output("Reindex_X"); T* p_reindex_x = reindex_x->mutable_data(ctx.GetPlace()); for (size_t i = 0; i < bs; i++) { p_reindex_x[i] = node_map[p_vertices[i]]; } // 8. Get operator's outputs. - auto* sample_index = ctx.Output("Sample_Index"); - auto* out_src = ctx.Output("Out_Src"); - auto* out_dst = ctx.Output("Out_Dst"); + auto* sample_index = ctx.Output("Sample_Index"); + auto* out_src = ctx.Output("Out_Src"); + auto* out_dst = ctx.Output("Out_Dst"); sample_index->Resize({static_cast(unique_nodes.size())}); out_src->Resize({static_cast(src_merge.size()), 1}); out_dst->Resize({static_cast(src_merge.size()), 1}); diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index da9ccdf627f44..9230e114bd3bb 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -25,7 +25,6 @@ class DenseTensor; namespace paddle { namespace operators { -using framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using DataLayout = platform::DataLayout; using ScopedSpatialTransformerDescriptor = @@ -43,9 +42,9 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { "It must use CUDAPlace when using CUDA Kernel")); auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output = ctx.Output("Output"); + auto* input = ctx.Input("X"); + auto* grid = ctx.Input("Grid"); + auto* output = ctx.Output("Output"); int n = input->dims()[0]; int c = input->dims()[1]; @@ -92,11 +91,14 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { "It must use CUDAPlace when using CUDA Kernel")); auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); + auto* input = ctx.Input("X"); + auto* grid = ctx.Input("Grid"); + auto* output_grad = + ctx.Input(framework::GradVarName("Output")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + auto* grid_grad = + 
ctx.Output(framework::GradVarName("Grid")); auto output_grad_dims = output_grad->dims(); const int n = output_grad_dims[0]; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 12b18bc55e2eb..5d63f6b9a500f 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class GridSampleOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/grid_sampler_op_mlu.cc b/paddle/fluid/operators/grid_sampler_op_mlu.cc index 8327eaad14425..b62cc6b555c55 100644 --- a/paddle/fluid/operators/grid_sampler_op_mlu.cc +++ b/paddle/fluid/operators/grid_sampler_op_mlu.cc @@ -18,7 +18,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class GridSamplerMLUKernel : public framework::OpKernel { @@ -30,9 +30,9 @@ class GridSamplerMLUKernel : public framework::OpKernel { platform::errors::Unavailable("This kernel only runs on MLU.")); // input and output data - const Tensor* input = ctx.Input("X"); - const Tensor* grid = ctx.Input("Grid"); - Tensor* output = ctx.Output("Output"); + const phi::DenseTensor* input = ctx.Input("X"); + const phi::DenseTensor* grid = ctx.Input("Grid"); + phi::DenseTensor* output = ctx.Output("Output"); int n = input->dims()[0]; int c = input->dims()[1]; diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 1ac0093735925..7f9c2cea9bb45 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/group_norm_op.h" + #include #include #include @@ -26,7 +28,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 105d4d6c75efe..bda0124ffa72b 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -268,13 +268,13 @@ class GroupNormKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* x = ctx.Input("X"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); const auto groups = ctx.Attr("groups"); const auto x_dims = x->dims(); @@ -616,19 +616,20 @@ class GroupNormGradKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* mean = ctx.Input("Mean"); - auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* mean = ctx.Input("Mean"); + auto* var = ctx.Input("Variance"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); const auto groups = ctx.Attr("groups"); // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = + ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = ctx.Output(framework::GradVarName("Bias")); const auto& x_dims = x->dims(); const int C = diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 28a3ad2a8e1ee..97ea7ce8f5d39 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; @@ -40,13 +40,13 @@ class GroupNormKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* x = ctx.Input("X"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); const auto groups = ctx.Attr("groups"); const auto x_dims = x->dims(); @@ -221,17 +221,18 @@ class GroupNormGradKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("Y"); - auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); + auto* x = ctx.Input("Y"); + auto* var = ctx.Input("Variance"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); const auto groups = ctx.Attr("groups"); // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = + ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = ctx.Output(framework::GradVarName("Bias")); const auto& x_dims = x->dims(); const int C = diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc index a39c44768c224..0e817515f915c 100644 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template struct GroupNormFunction { @@ -32,8 +32,8 @@ struct GroupNormFunction { stream = ctx.template device_context() .stream(); } - void ReduceMean(const Tensor* x, - Tensor* y, + void ReduceMean(const phi::DenseTensor* x, + phi::DenseTensor* y, const std::vector& dim, bool keep_dims = true) { // y should be init first @@ -41,8 +41,8 @@ struct GroupNormFunction { "ReduceMeanD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); runner.Run(stream); } - void ReduceSum(const Tensor* x, - Tensor* y, + void ReduceSum(const phi::DenseTensor* x, + phi::DenseTensor* y, const std::vector& dim, bool keep_dims = true) { // y should be init first @@ -50,48 +50,60 @@ struct GroupNormFunction { "ReduceSumD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); runner.Run(stream); } - void Add(const Tensor* x, const Tensor* y, Tensor* z) { + void Add(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + void Sub(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + void Mul(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Div(const Tensor* x, const Tensor* y, Tensor* z) { + void Div(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("Div", {*x, *y}, {*z}, {}); runner.Run(stream); } - void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) { + void DivNoNan(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // y should be init first const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Transpose(const Tensor* x, Tensor* y, const std::vector& axis) { + void Transpose(const phi::DenseTensor* x, + phi::DenseTensor* y, + const std::vector& axis) { // y should be init first const auto& runner = NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); runner.Run(stream); } - void Sqrt(const Tensor* x, Tensor* y) { + void Sqrt(const phi::DenseTensor* x, phi::DenseTensor* y) { // y should be init first const auto& runner = NpuOpRunner("Sqrt", {*x}, {*y}, {}); runner.Run(stream); } - void Adds(const Tensor* x, float scalar, Tensor* y) { + void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { // y should be init first const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); runner.Run(stream); } - Tensor ReduceMeanToNG(const Tensor* x, + Tensor ReduceMeanToNG(const phi::DenseTensor* x, const DataLayout& data_layout, const int64_t N, const int64_t C, @@ -129,13 +141,13 @@ class GroupNormNPUKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* x = ctx.Input("X"); - auto* y = 
ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); const auto groups = ctx.Attr("groups"); auto place = ctx.GetPlace(); @@ -203,18 +215,19 @@ class GroupNormGradNPUKernel : public framework::OpKernel { const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* y = ctx.Input("Y"); - auto* var = ctx.Input("Variance"); + auto* y = ctx.Input("Y"); + auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); const auto G = ctx.Attr("groups"); // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = + ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = ctx.Output(framework::GradVarName("Bias")); GroupNormFunction F(ctx); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 1040f2c2ea066..fc78f514a4507 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -26,8 +26,6 @@ DECLARE_int32(paddle_num_threads); namespace paddle { namespace operators { -using framework::Tensor; - class GRUOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -321,10 +319,10 @@ class GRUCPUKernel : public framework::OpKernel { bool origin_mode = context.Attr("origin_mode"); auto* input = context.Input("Input"); - auto* h0 = context.Input("H0"); - auto* weight = context.Input("Weight"); + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); const T* weight_data = weight->data(); - auto* bias = context.Input("Bias"); + auto* bias = context.Input("Bias"); auto* hidden = context.Output("Hidden"); hidden->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index f3665da181641..2d63eb4d3a698 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -26,10 +26,10 @@ class GRUKernel : public framework::OpKernel { bool is_test = context.Attr("is_test"); bool origin_mode = context.Attr("origin_mode"); auto* input = context.Input("Input"); - auto* h0 = context.Input("H0"); - auto* weight = context.Input("Weight"); + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); const T* weight_data = weight->data(); - auto* bias = context.Input("Bias"); + auto* bias = context.Input("Bias"); auto* hidden = context.Output("Hidden"); hidden->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 3e931e7bfa8e0..b95932b51802f 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -26,13 +26,13 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template inline void ReorderInitState(const DeviceContext& ctx, - const framework::Tensor& src, + const phi::DenseTensor& src, framework::Vector index_lod, - 
framework::Tensor* dst, + phi::DenseTensor* dst, bool indexed_src) { phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); @@ -44,8 +44,8 @@ class GRUGradKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { bool origin_mode = context.Attr("origin_mode"); - auto* h0 = context.Input("H0"); - auto* weight = context.Input("Weight"); + auto* h0 = context.Input("H0"); + auto* weight = context.Input("Weight"); const T* weight_data = weight->data(); auto* batch_gate = context.Input("BatchGate"); auto* batch_reset_hidden_prev = @@ -56,10 +56,12 @@ class GRUGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Hidden")); auto* input_grad = context.Output(framework::GradVarName("Input")); - auto* h0_grad = context.Output(framework::GradVarName("H0")); + auto* h0_grad = + context.Output(framework::GradVarName("H0")); auto* weight_grad = - context.Output(framework::GradVarName("Weight")); - auto* bias_grad = context.Output(framework::GradVarName("Bias")); + context.Output(framework::GradVarName("Weight")); + auto* bias_grad = + context.Output(framework::GradVarName("Bias")); auto gate_dims = batch_gate->dims(); auto hidden_dims = hidden->dims(); diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc index 24d4771fac539..8e05454f1aefc 100644 --- a/paddle/fluid/operators/gru_unit_op.cc +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class GRUUnitOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index bae428fc570d1..3ed3179a63e63 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; @@ -55,15 +55,16 @@ class GRUUnitKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("Input"); - auto* hidden_prev = context.Input("HiddenPrev"); - auto* weight = context.Input("Weight"); - auto* bias = context.Input("Bias"); - auto* gate = context.Output("Gate"); + auto* input = context.Input("Input"); + auto* hidden_prev = context.Input("HiddenPrev"); + auto* weight = context.Input("Weight"); + auto* bias = context.Input("Bias"); + auto* gate = context.Output("Gate"); gate->mutable_data(context.GetPlace()); - auto* reset_hidden_prev = context.Output("ResetHiddenPrev"); + auto* reset_hidden_prev = + context.Output("ResetHiddenPrev"); reset_hidden_prev->mutable_data(context.GetPlace()); - auto* hidden = context.Output("Hidden"); + auto* hidden = context.Output("Hidden"); hidden->mutable_data(context.GetPlace()); int batch_size = input->dims()[0]; @@ -175,18 +176,22 @@ class GRUUnitGradKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("Input"); - auto* hidden_prev = context.Input("HiddenPrev"); - auto* weight = context.Input("Weight"); - auto* gate = context.Input("Gate"); - auto* reset_hidden_prev = context.Input("ResetHiddenPrev"); - auto* hidden_grad = context.Input(framework::GradVarName("Hidden")); - auto* input_grad = context.Output(framework::GradVarName("Input")); + auto* input = context.Input("Input"); + auto* hidden_prev = context.Input("HiddenPrev"); + auto* weight = context.Input("Weight"); + auto* gate = context.Input("Gate"); + auto* reset_hidden_prev = + context.Input("ResetHiddenPrev"); + auto* hidden_grad = + context.Input(framework::GradVarName("Hidden")); + auto* input_grad = + context.Output(framework::GradVarName("Input")); auto* hidden_prev_grad = - context.Output(framework::GradVarName("HiddenPrev")); + context.Output(framework::GradVarName("HiddenPrev")); auto* weight_grad = - context.Output(framework::GradVarName("Weight")); - auto* bias_grad = context.Output(framework::GradVarName("Bias")); + context.Output(framework::GradVarName("Weight")); + auto* bias_grad = + context.Output(framework::GradVarName("Bias")); Tensor gate_grad; Tensor reset_hidden_prev_grad; diff --git a/paddle/fluid/operators/hinge_loss_op.h b/paddle/fluid/operators/hinge_loss_op.h index 78e253ad4b0cb..8f06154c79060 100644 --- a/paddle/fluid/operators/hinge_loss_op.h +++ b/paddle/fluid/operators/hinge_loss_op.h @@ -24,9 +24,9 @@ template class HingeLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* pred = context.Input("Logits"); - auto* label = context.Input("Labels"); - auto* loss = context.Output("Loss"); + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* loss = context.Output("Loss"); auto& place = *context.template device_context().eigen_device(); @@ -42,12 +42,12 @@ template class HingeLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* pred = context.Input("Logits"); - auto* label = context.Input("Labels"); + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); auto* dloss = - 
context.Input(framework::GradVarName("Loss")); + context.Input(framework::GradVarName("Loss")); auto* dpred = - context.Output(framework::GradVarName("Logits")); + context.Output(framework::GradVarName("Logits")); auto& place = *context.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc index fc1bec5a2e52b..9d58d65c83135 100644 --- a/paddle/fluid/operators/histogram_op.cc +++ b/paddle/fluid/operators/histogram_op.cc @@ -24,7 +24,6 @@ namespace paddle { namespace operators { using framework::OpKernelType; -using framework::Tensor; class HistogramOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc index 61944c2caaf3b..a7be6feb628bf 100644 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ b/paddle/fluid/operators/huber_loss_op_npu.cc @@ -18,14 +18,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template void HuberLossSub(const platform::Place& place, const aclrtStream& stream, - const Tensor* x, - const Tensor* y, - Tensor* z) { + const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { // Calculate z = x - y z->mutable_data(x->dims(), place); const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); @@ -35,9 +35,9 @@ void HuberLossSub(const platform::Place& place, template void HuberLossMuls(const platform::Place& place, const aclrtStream& stream, - const Tensor* x, + const phi::DenseTensor* x, float scalar, - Tensor* y) { + phi::DenseTensor* y) { // Calculate y = x + scale y->mutable_data(x->dims(), place); const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); @@ -47,8 +47,8 @@ void HuberLossMuls(const platform::Place& place, template void HuberLossZerosLike(const platform::Place& place, const aclrtStream& stream, - const Tensor* x, - Tensor* y) { + const phi::DenseTensor* x, + phi::DenseTensor* y) { y->mutable_data(x->dims(), place); const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*y}, {}); runner.Run(stream); @@ -57,10 +57,10 @@ void HuberLossZerosLike(const platform::Place& place, template void HuberLossSmoothL1Loss(const platform::Place& place, const aclrtStream& stream, - const Tensor* x, - const Tensor* y, + const phi::DenseTensor* x, + const phi::DenseTensor* y, float delta, - Tensor* z) { + phi::DenseTensor* z) { z->mutable_data(x->dims(), place); const auto& runner = NpuOpRunner("SmoothL1Loss", {*x, *y}, {*z}, {{"sigma", delta}}); @@ -70,11 +70,11 @@ void HuberLossSmoothL1Loss(const platform::Place& place, template void HuberLossSmoothL1LossGrad(const platform::Place& place, const aclrtStream& stream, - const Tensor* pred, - const Tensor* lab, - const Tensor* dout, + const phi::DenseTensor* pred, + const phi::DenseTensor* lab, + const phi::DenseTensor* dout, float sigma, - Tensor* grad) { + phi::DenseTensor* grad) { grad->mutable_data(pred->dims(), place); const auto& runner = NpuOpRunner( "SmoothL1LossGrad", {*pred, *lab, *dout}, {*grad}, {{"sigma", sigma}}); @@ -85,10 +85,10 @@ template class HuberLossNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in0 = ctx.Input("X"); - auto* in1 = ctx.Input("Y"); - auto* residual = ctx.Output("Residual"); - auto* out = ctx.Output("Out"); + auto* in0 = ctx.Input("X"); + auto* in1 = ctx.Input("Y"); + auto* residual = 
ctx.Output("Residual"); + auto* out = ctx.Output("Out"); auto delta = ctx.Attr("delta"); auto stream = @@ -106,10 +106,10 @@ template class HuberLossGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* residual = ctx.Input("Residual"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* residual = ctx.Input("Residual"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); auto delta = ctx.Attr("delta"); auto stream = diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index 383a9abafeaea..b886eb602ccd2 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -26,7 +26,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; inline int Im2SeqOutputSize( @@ -40,7 +40,7 @@ template class Im2SequenceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* in = ctx.Input("X"); + const phi::DenseTensor* in = ctx.Input("X"); LoDTensor* out = ctx.Output("Out"); auto in_dim = in->dims(); int batch_size = in_dim[0]; @@ -51,7 +51,7 @@ class Im2SequenceKernel : public framework::OpKernel { auto strides = ctx.Attr>("strides"); auto paddings = ctx.Attr>("paddings"); if (ctx.HasInput("Y") && batch_size > 1) { - const Tensor* imgrealsize = ctx.Input("Y"); + const phi::DenseTensor* imgrealsize = ctx.Input("Y"); auto out_stride = ctx.Attr>("out_stride"); Tensor cpu_shape_tensor; paddle::framework::TensorCopySync( @@ -157,10 +157,10 @@ template class Im2SequenceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - Tensor* d_out = - const_cast(ctx.Input(framework::GradVarName("Out"))); - auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* in = ctx.Input("X"); + phi::DenseTensor* d_out = const_cast( + ctx.Input(framework::GradVarName("Out"))); + auto* d_x = ctx.Output(framework::GradVarName("X")); d_x->mutable_data(ctx.GetPlace()); auto x_v = framework::EigenVector::Flatten(*d_x); diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc index 231cadd661bcd..edd89ac4f9584 100644 --- a/paddle/fluid/operators/increment_op_npu.cc +++ b/paddle/fluid/operators/increment_op_npu.cc @@ -22,8 +22,8 @@ template class IncrementalNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x_tensor = context.Input("X"); - auto* out_tensor = context.Output("Out"); + auto* x_tensor = context.Input("X"); + auto* out_tensor = context.Output("Out"); float step = context.Attr("step"); out_tensor->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc index 9dd0a76b9805f..dbb8410d1eada 100644 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ b/paddle/fluid/operators/index_sample_op_npu.cc @@ -17,13 +17,13 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template void IndexSampleGather(const paddle::platform::NPUDeviceContext& dev_ctx, - const Tensor* index, - const Tensor* input, - Tensor* out) { + const phi::DenseTensor* index, + const phi::DenseTensor* input, + phi::DenseTensor* out) { auto index_dims = index->dims(); auto input_dims = input->dims(); auto batch_size = input_dims[0]; @@ -72,9 +72,9 @@ class IndexSampleNPUKernel : public framework::OpKernel { template void IndexSampleGradScatter(const paddle::platform::NPUDeviceContext& dev_ctx, - const Tensor* index, - const Tensor* out_grad, - Tensor* x_grad) { + const phi::DenseTensor* index, + const phi::DenseTensor* out_grad, + phi::DenseTensor* x_grad) { auto index_dims = index->dims(); auto input_dims = x_grad->dims(); auto batch_size = input_dims[0]; diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc index c6bed95e83dc5..83b0eefecf77f 100644 --- a/paddle/fluid/operators/index_select_op.cc +++ b/paddle/fluid/operators/index_select_op.cc @@ -23,8 +23,6 @@ namespace paddle { namespace operators { -using framework::Tensor; - class IndexSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index bf878bbbbc82c..7a6b605df3944 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using DDim = framework::DDim; diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index 22f4d0161f028..0f18f9793d305 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -19,17 +19,17 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class IndexSelectNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* index = ctx.Input("Index"); + auto* x = ctx.Input("X"); + auto* index = ctx.Input("Index"); auto dim = ctx.Attr("dim"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); auto stream = @@ -50,10 +50,9 @@ template class IndexSelectGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + auto* index = ctx.Input("Index"); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto stream = ctx.template device_context() diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 791fef1f7c59d..c326656b46f0a 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -40,26 +40,26 @@ class InplaceABNOp : public paddle::operators::BatchNormOp { if (input_data_type == framework::proto::VarType::FP64) { bn_param_type = framework::proto::VarType::FP64; } - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Mean")->dtype()), - platform::errors::InvalidArgument( - "Mean input should be of float type")); - PADDLE_ENFORCE_EQ( - bn_param_type, - framework::TransToProtoVarType(ctx.Input("Variance")->dtype()), - platform::errors::InvalidArgument( - "Variance input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Scale")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Bias")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Mean")->dtype()), + platform::errors::InvalidArgument( + "Mean input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, + framework::TransToProtoVarType( + ctx.Input("Variance")->dtype()), + platform::errors::InvalidArgument( + "Variance input should be of float type")); framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; @@ -138,13 +138,13 @@ class InplaceABNGradOp : public paddle::operators::BatchNormGradOp { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { const auto* var = ctx.InputVar(framework::GradVarName("Y")); - auto input_data_type = - framework::TransToProtoVarType(ctx.Input("Y")->dtype()); + auto input_data_type = framework::TransToProtoVarType( + ctx.Input("Y")->dtype()); if (var == nullptr) { PADDLE_THROW(platform::errors::InvalidArgument( 
"can't find gradient variable of Y")); } - const Tensor* t = nullptr; + const phi::DenseTensor* t = nullptr; if (var->IsType()) { t = &var->Get(); } else if (var->IsType()) { @@ -221,8 +221,8 @@ template class InplaceABNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Output("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Output("Y"); PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( @@ -231,10 +231,10 @@ class InplaceABNKernel : public framework::OpKernel { GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* mean = ctx.Input("Mean"); - auto* variance = ctx.Input("Variance"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); auto momentum = ctx.Attr("momentum"); auto epsilon = ctx.Attr("epsilon"); @@ -244,11 +244,11 @@ class InplaceABNKernel : public framework::OpKernel { auto trainable_statistics = ctx.Attr("trainable_statistics"); auto fuse_with_relu = ctx.Attr("fuse_with_relu"); - auto* mean_out = ctx.Output("MeanOut"); - auto* variance_out = ctx.Output("VarianceOut"); - auto* saved_mean = ctx.Output("SavedMean"); - auto* saved_variance = ctx.Output("SavedVariance"); - auto* reserve_space = ctx.Output("ReserveSpace"); + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); auto& dev_ctx = ctx.device_context(); phi::BatchNormKernel( @@ -283,9 +283,9 @@ template class InplaceABNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* y = ctx.Input("Y"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* y = ctx.Input("Y"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); + auto* d_x = ctx.Output(framework::GradVarName("X")); PADDLE_ENFORCE_EQ(d_x, d_y, platform::errors::InvalidArgument( @@ -304,10 +304,10 @@ class InplaceABNGradKernel : public framework::OpKernel { // BatchNormGradKernel::Compute(ctx); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* saved_mean = ctx.Input("SavedMean"); - auto* saved_variance = ctx.Input("SavedVariance"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); auto momentum = ctx.Attr("momentum"); auto epsilon = ctx.Attr("epsilon"); @@ -317,12 +317,14 @@ class InplaceABNGradKernel : public framework::OpKernel { auto trainable_statistics = ctx.Attr("trainable_statistics"); auto fuse_with_relu = ctx.Attr("fuse_with_relu"); - auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); - auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + auto* scale_grad = + ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); - auto* reserve_space = ctx.Input("ReserveSpace"); - auto* mean = ctx.Input("ReserveSpace"); - auto* variance = ctx.Input("ReserveSpace"); + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = 
ctx.Input("ReserveSpace"); paddle::optional space_opt; paddle::optional mean_opt; diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index 044b8118abb0e..0ee6d686a7539 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -27,8 +27,8 @@ template class InplaceABNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* y = ctx.Output("Y"); - auto* x = ctx.Input("X"); + auto* y = ctx.Output("Y"); + auto* x = ctx.Input("X"); PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( @@ -37,10 +37,10 @@ class InplaceABNKernel : public framework::OpKernel { GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* mean = ctx.Input("Mean"); - auto* variance = ctx.Input("Variance"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); auto momentum = ctx.Attr("momentum"); auto epsilon = ctx.Attr("epsilon"); @@ -50,11 +50,11 @@ class InplaceABNKernel : public framework::OpKernel { auto trainable_statistics = ctx.Attr("trainable_statistics"); auto fuse_with_relu = ctx.Attr("fuse_with_relu"); - auto* mean_out = ctx.Output("MeanOut"); - auto* variance_out = ctx.Output("VarianceOut"); - auto* saved_mean = ctx.Output("SavedMean"); - auto* saved_variance = ctx.Output("SavedVariance"); - auto* reserve_space = ctx.Output("ReserveSpace"); + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); if (ctx.Attr("use_sync_bn")) { auto& dev_ctx = ctx.device_context(); @@ -116,9 +116,9 @@ template class InplaceABNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const auto* y = ctx.Input("Y"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - auto* d_x = ctx.Output(framework::GradVarName("X")); + const auto* y = ctx.Input("Y"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); + auto* d_x = ctx.Output(framework::GradVarName("X")); PADDLE_ENFORCE_EQ(d_x, d_y, platform::errors::InvalidArgument( @@ -135,10 +135,10 @@ class InplaceABNGradKernel : public framework::OpKernel { InplaceABNActivation functor; functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* saved_mean = ctx.Input("SavedMean"); - auto* saved_variance = ctx.Input("SavedVariance"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); auto momentum = ctx.Attr("momentum"); auto epsilon = ctx.Attr("epsilon"); @@ -148,12 +148,14 @@ class InplaceABNGradKernel : public framework::OpKernel { auto trainable_statistics = ctx.Attr("trainable_statistics"); auto fuse_with_relu = ctx.Attr("fuse_with_relu"); - auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); - auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + auto* scale_grad = + ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); - auto* reserve_space = 
ctx.Input("ReserveSpace"); - auto* mean = ctx.Input("ReserveSpace"); - auto* variance = ctx.Input("ReserveSpace"); + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); if (ctx.Attr("use_sync_bn")) { auto& dev_ctx = ctx.device_context(); diff --git a/paddle/fluid/operators/inplace_abn_op.h b/paddle/fluid/operators/inplace_abn_op.h index 2b4e89f1c85fa..2a9568e845492 100644 --- a/paddle/fluid/operators/inplace_abn_op.h +++ b/paddle/fluid/operators/inplace_abn_op.h @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index 0da87b2a7c4e3..ae4da5c51a088 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -40,18 +40,18 @@ framework::OpKernelType InstanceNormOp::GetExpectedKernelType( in_param_type = framework::proto::VarType::FP64; } if (ctx.HasInput("Scale")) { - PADDLE_ENFORCE_EQ( - in_param_type, - framework::TransToProtoVarType(ctx.Input("Scale")->dtype()), - platform::errors::InvalidArgument( - "Scale input should be of float type")); + PADDLE_ENFORCE_EQ(in_param_type, + framework::TransToProtoVarType( + ctx.Input("Scale")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); } if (ctx.HasInput("Bias")) { - PADDLE_ENFORCE_EQ( - in_param_type, - framework::TransToProtoVarType(ctx.Input("Bias")->dtype()), - platform::errors::InvalidArgument( - "Bias input should be of float type")); + PADDLE_ENFORCE_EQ(in_param_type, + framework::TransToProtoVarType( + ctx.Input("Bias")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); } return framework::OpKernelType(input_data_type, ctx.GetPlace()); diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h index 3f99cdf10c64b..43505cac2817b 100644 --- a/paddle/fluid/operators/instance_norm_op.h +++ b/paddle/fluid/operators/instance_norm_op.h @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; diff --git a/paddle/fluid/operators/instance_norm_op_npu.cc b/paddle/fluid/operators/instance_norm_op_npu.cc index 89c6a310d746d..0370cef2ed0cf 100644 --- a/paddle/fluid/operators/instance_norm_op_npu.cc +++ b/paddle/fluid/operators/instance_norm_op_npu.cc @@ -18,19 +18,19 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class InstanceNormNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("SavedMean"); - auto* variance = ctx.Output("SavedVariance"); + const auto* x = ctx.Input("X"); + const auto* scale = ctx.Input("Scale"); + const auto* bias = ctx.Input("Bias"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("SavedMean"); + auto* variance = ctx.Output("SavedVariance"); auto& dev_ctx = ctx.template device_context(); dev_ctx.template Alloc(y); diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 213d14ec48f66..056b81fd9a2ef 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -23,7 +23,6 @@ namespace paddle { namespace operators { -using framework::Tensor; using DataLayout = framework::DataLayout; static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { @@ -359,7 +358,7 @@ class InterpolateOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && @@ -612,7 +611,7 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "SizeTensor" || var_name == "Scale") { return expected_kernel_type; diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 80534d29b5ae4..1e2ba7501a5dc 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -19,7 +19,6 @@ namespace paddle { namespace operators { -using framework::Tensor; using DataLayout = framework::DataLayout; template @@ -913,8 +912,8 @@ __global__ void KeBicubicInterpBw(T* in, template static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, - const Tensor& input, - Tensor* output) { + const phi::DenseTensor& input, + phi::DenseTensor* output) { auto* input_data = input.data(); const std::string data_layout_str = ctx.Attr("data_layout"); @@ -928,14 +927,14 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); out_w = new_size[0]; } else { float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -945,7 +944,7 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, if (scale > 0) { out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = 
ctx.Input("OutSize"); if (out_size != nullptr) { Tensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); @@ -1005,8 +1004,8 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, template static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, - const Tensor& input, - Tensor* output) { + const phi::DenseTensor& input, + phi::DenseTensor* output) { auto* input_data = input.data(); const std::string data_layout_str = ctx.Attr("data_layout"); @@ -1021,7 +1020,7 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); @@ -1029,7 +1028,7 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, out_w = new_size[1]; } else { float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -1040,7 +1039,7 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, out_h = static_cast(in_h * scale); out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { Tensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); @@ -1157,8 +1156,8 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, template static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, - const Tensor& input, - Tensor* output) { + const phi::DenseTensor& input, + phi::DenseTensor* output) { auto* input_data = input.data(); const std::string data_layout_str = ctx.Attr("data_layout"); @@ -1174,7 +1173,7 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); @@ -1183,7 +1182,7 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, out_w = new_size[2]; } else { float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -1195,7 +1194,7 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, out_h = static_cast(in_h * scale); out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { Tensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); @@ -1289,9 +1288,9 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, template static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, + phi::DenseTensor* input_grad, const Tensor output_grad) { - auto* input = ctx.Input("X"); + auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; @@ -1303,7 
+1302,7 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -1314,14 +1313,14 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { Tensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_w = size_data[0]; } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); @@ -1380,9 +1379,9 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, template static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, + phi::DenseTensor* input_grad, const Tensor output_grad) { - auto* input = ctx.Input("X"); + auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; @@ -1395,7 +1394,7 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -1407,7 +1406,7 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { Tensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); @@ -1415,7 +1414,7 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, out_h = size_data[0]; out_w = size_data[1]; } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); @@ -1526,9 +1525,9 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, template static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, - const Tensor& output_grad) { - auto* input = ctx.Input("X"); + phi::DenseTensor* input_grad, + const phi::DenseTensor& output_grad) { + auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; @@ -1542,7 +1541,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -1555,7 +1554,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, out_w = static_cast(in_w * scale); } - auto out_size = 
ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { Tensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); @@ -1564,7 +1563,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, out_h = size_data[1]; out_w = size_data[2]; } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); @@ -1651,8 +1650,8 @@ class InterpolateOpCUDAKernel : public framework::OpKernel { platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::NotFound("This kernel only runs on GPU device.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); auto input_dims = input->dims(); if (input_dims.size() == 3) { // 1D interpolation @@ -1673,8 +1672,10 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::NotFound("This kernel only runs on GPU device.")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); auto output_grad_dims = output_grad->dims(); if (output_grad_dims.size() == 3) { // 1D interpolation diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index ff474cfff9727..87825d5fa4ddd 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -26,11 +26,11 @@ template using EigenTensor = framework::EigenTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = framework::DataLayout; inline std::vector get_new_shape( - const std::vector& list_new_shape_tensor) { + const std::vector& list_new_shape_tensor) { // get tensor from std::vector vec_new_shape; for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { @@ -43,7 +43,7 @@ inline std::vector get_new_shape( tensor->dims())); if (platform::is_gpu_place(tensor->place()) || platform::is_mlu_place(tensor->place())) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { @@ -55,10 +55,11 @@ inline std::vector get_new_shape( } template -inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { +inline std::vector get_new_data_from_tensor( + const phi::DenseTensor* new_data_tensor) { std::vector vec_new_data; auto* new_data = new_data_tensor->data(); - framework::Tensor cpu_starts_tensor; + phi::DenseTensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place()) || platform::is_mlu_place(new_data_tensor->place())) { paddle::framework::TensorCopySync( @@ -97,8 +98,8 @@ inline void ExtractNCDWH(const framework::DDim& dims, } template -static void NearestNeighborInterpolate(const Tensor& input, - Tensor* output, +static void NearestNeighborInterpolate(const phi::DenseTensor& input, + phi::DenseTensor* output, const float ratio_h, const float ratio_w, const int n, @@ -131,8 +132,8 @@ static void NearestNeighborInterpolate(const Tensor& input, } template -static void LinearInterpolation(const Tensor& input, - Tensor* output, +static void LinearInterpolation(const 
phi::DenseTensor& input, + phi::DenseTensor* output, const float ratio_w, const int in_w, const int n, @@ -195,8 +196,8 @@ static void LinearInterpolation(const Tensor& input, } template -static void LinearInterpolationGrad(const Tensor& output_grad, - Tensor* input_grad, +static void LinearInterpolationGrad(const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad, const float ratio_w, const int in_w, const int n, @@ -237,8 +238,8 @@ static void LinearInterpolationGrad(const Tensor& output_grad, } template -static void BilinearInterpolation(const Tensor& input, - Tensor* output, +static void BilinearInterpolation(const phi::DenseTensor& input, + phi::DenseTensor* output, const float ratio_h, const float ratio_w, const int in_h, @@ -337,8 +338,8 @@ static void BilinearInterpolation(const Tensor& input, } template -static void TrilinearInterpolation(const Tensor& input, - Tensor* output, +static void TrilinearInterpolation(const phi::DenseTensor& input, + phi::DenseTensor* output, const float ratio_d, const float ratio_h, const float ratio_w, @@ -522,8 +523,8 @@ static inline T cubic_interp(T x0, T x1, T x2, T x3, T t) { } template -static void BicubicInterpolation(const Tensor& input, - Tensor* output, +static void BicubicInterpolation(const phi::DenseTensor& input, + phi::DenseTensor* output, const float ratio_h, const float ratio_w, const int in_h, @@ -602,8 +603,8 @@ static void BicubicInterpolation(const Tensor& input, } template -static void NearestNeighborInterpolateGrad(const Tensor& output_grad, - Tensor* input_grad, +static void NearestNeighborInterpolateGrad(const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad, const float ratio_h, const float ratio_w, const int n, @@ -637,8 +638,8 @@ static void NearestNeighborInterpolateGrad(const Tensor& output_grad, } template -static void BilinearInterpolationGrad(const Tensor& output_grad, - Tensor* input_grad, +static void BilinearInterpolationGrad(const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad, const float ratio_h, const float ratio_w, const int in_h, @@ -696,8 +697,8 @@ static void BilinearInterpolationGrad(const Tensor& output_grad, } template -static void TrilinearInterpolationGrad(const Tensor& output_grad, - Tensor* input_grad, +static void TrilinearInterpolationGrad(const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad, const float ratio_d, const float ratio_h, const float ratio_w, @@ -793,8 +794,8 @@ static void TrilinearInterpolationGrad(const Tensor& output_grad, } template -static void BicubicInterpolationGrad(const Tensor& output_grad, - Tensor* input_grad, +static void BicubicInterpolationGrad(const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad, const float ratio_h, const float ratio_w, const int in_h, @@ -854,8 +855,8 @@ static void BicubicInterpolationGrad(const Tensor& output_grad, template static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, - const Tensor& input, - Tensor* output) { + const phi::DenseTensor& input, + phi::DenseTensor* output) { const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; @@ -866,14 +867,14 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, int align_mode = ctx.Attr("align_mode"); int out_w = ctx.Attr("out_w"); - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { 
// have size tensor auto new_size = get_new_shape(list_new_size_tensor); out_w = new_size[0]; } else { float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -883,7 +884,7 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, if (scale > 0) { out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { auto out_size_data = get_new_data_from_tensor(out_size); out_w = out_size_data[0]; @@ -928,8 +929,8 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, template static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, - const Tensor& input, - Tensor* output) { + const phi::DenseTensor& input, + phi::DenseTensor* output) { const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; @@ -942,7 +943,7 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); @@ -950,7 +951,7 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, out_w = new_size[1]; } else { float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -961,7 +962,7 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, out_h = static_cast(in_h * scale); out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { auto out_size_data = get_new_data_from_tensor(out_size); out_h = out_size_data[0]; @@ -1045,8 +1046,8 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, template static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, - const Tensor& input, - Tensor* output) { + const phi::DenseTensor& input, + phi::DenseTensor* output) { const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; @@ -1060,7 +1061,7 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); @@ -1069,7 +1070,7 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, out_w = new_size[2]; } else { float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -1081,7 +1082,7 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, out_h = static_cast(in_h * scale); out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto 
out_size = ctx.Input("OutSize"); if (out_size != nullptr) { auto out_size_data = get_new_data_from_tensor(out_size); out_d = out_size_data[0]; @@ -1157,9 +1158,9 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, template static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, - const Tensor& output_grad) { - auto* input = ctx.Input("X"); + phi::DenseTensor* input_grad, + const phi::DenseTensor& output_grad) { + auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; @@ -1171,7 +1172,7 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -1181,12 +1182,12 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, if (scale > 0) { out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { auto out_size_data = get_new_data_from_tensor(out_size); out_w = out_size_data[0]; } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); @@ -1231,9 +1232,9 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, template static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, - const Tensor& output_grad) { - auto* input = ctx.Input("X"); + phi::DenseTensor* input_grad, + const phi::DenseTensor& output_grad) { + auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; @@ -1246,7 +1247,7 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -1257,13 +1258,13 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, out_h = static_cast(in_h * scale); out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { auto out_size_data = get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); @@ -1342,9 +1343,9 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, template static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, - Tensor* input_grad, + phi::DenseTensor* input_grad, const Tensor output_grad) { - auto* input = ctx.Input("X"); + auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = 
framework::StringToDataLayout(data_layout_str); int n, c, in_d, in_h, in_w; @@ -1358,7 +1359,7 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale = scale_data[0]; @@ -1370,14 +1371,14 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, out_h = static_cast(in_h * scale); out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { auto out_size_data = get_new_data_from_tensor(out_size); out_d = out_size_data[0]; out_h = out_size_data[1]; out_w = out_size_data[2]; } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); @@ -1442,8 +1443,8 @@ template class InterpolateKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); auto input_dims = input->dims(); if (input_dims.size() == 3) { // 1D interpolation @@ -1460,8 +1461,10 @@ template class InterpolateGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); auto output_grad_dims = output_grad->dims(); if (output_grad_dims.size() == 3) { // 1D interpolation grad diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc index 3548506eea4c1..1e99738a6b620 100644 --- a/paddle/fluid/operators/interpolate_op_npu.cc +++ b/paddle/fluid/operators/interpolate_op_npu.cc @@ -20,7 +20,7 @@ limitations under the License. 
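The CPU and CUDA entry kernels above share one dispatch shape: fetch `X`/`Out` as `phi::DenseTensor` and branch on the input rank to pick the 1-D, 2-D, or 3-D path. The sketch below shows that structure only; the stub stands in for the real interpolation routines and is not part of the patch.

```cpp
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

template <typename T>
static void InterpolateNDStub(const framework::ExecutionContext& ctx,
                              const phi::DenseTensor& input,
                              phi::DenseTensor* output,
                              int spatial_rank) {
  // Stand-in for the 1-D / 2-D / 3-D interpolation paths shown above.
  output->Resize(input.dims());
  output->mutable_data<T>(ctx.GetPlace());
  (void)spatial_rank;
}

template <typename T>
class RankDispatchKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    const int rank = input->dims().size();
    if (rank == 3) {         // N, C, W   -> 1-D interpolation
      InterpolateNDStub<T>(ctx, *input, output, 1);
    } else if (rank == 4) {  // N, C, H, W -> 2-D interpolation
      InterpolateNDStub<T>(ctx, *input, output, 2);
    } else {                 // N, C, D, H, W -> 3-D interpolation
      InterpolateNDStub<T>(ctx, *input, output, 3);
    }
  }
};

}  // namespace operators
}  // namespace paddle
```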
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = framework::DataLayout; inline static void CheckArgument(const framework::ExecutionContext& ctx) { @@ -68,7 +68,7 @@ static void CalcOutSize(const framework::ExecutionContext& ctx, *out_w = ctx.Attr("out_w"); auto dev_ctx = platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { std::vector new_size_h(1); @@ -79,7 +79,7 @@ static void CalcOutSize(const framework::ExecutionContext& ctx, *out_w = new_size_w[0]; } else { float scale; - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { std::vector scale_data; framework::TensorToVector(*scale_tensor, *dev_ctx, &scale_data); @@ -93,7 +93,7 @@ static void CalcOutSize(const framework::ExecutionContext& ctx, *out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { std::vector out_size_data; framework::TensorToVector(*out_size, *dev_ctx, &out_size_data); @@ -124,7 +124,7 @@ class InterpolateNPUKernel : public framework::OpKernel { // when 'align_corners' is 'true' or data type is 'double' CheckArgument(ctx); - auto* input = ctx.Input("X"); + auto* input = ctx.Input("X"); framework::DDim input_dims = input->dims(); const std::string data_layout_str = @@ -141,7 +141,7 @@ class InterpolateNPUKernel : public framework::OpKernel { input_x.ShareDataWith(*input); input_x.set_layout(data_layout); - auto* output = ctx.Output("Out"); + auto* output = ctx.Output("Out"); framework::DDim output_dims; if (data_layout == DataLayout::kNCHW) { output_dims = {n, c, out_h, out_w}; @@ -175,7 +175,7 @@ class InterpolateGradNPUKernel : public framework::OpKernel { // when 'align_corners' is 'true' or data type is 'double' CheckArgument(ctx); - auto* input = ctx.Input("X"); + auto* input = ctx.Input("X"); framework::DDim input_dims = input->dims(); const std::string data_layout_str = @@ -188,12 +188,14 @@ class InterpolateGradNPUKernel : public framework::OpKernel { CalcOutSize(ctx, h, w, &out_h, &out_w); // the 'output_grad' tensor may has no set (or wrong set) of the layout - auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); Tensor output_grad_tmp(output_grad->type()); output_grad_tmp.ShareDataWith(*output_grad); output_grad_tmp.set_layout(data_layout); - auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); input_grad->set_layout(data_layout); framework::DDim input_grad_dims; if (data_layout == DataLayout::kNCHW) { diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 1bb68699a8553..e7a362f543b76 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -25,7 +25,6 @@ namespace paddle { namespace operators { -using framework::Tensor; using DataLayout = framework::DataLayout; static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { @@ -463,7 +462,7 @@ class InterpolateV2Op : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const 
framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && @@ -713,7 +712,7 @@ class InterpolateV2OpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "SizeTensor" || var_name == "Scale") { return expected_kernel_type; diff --git a/paddle/fluid/operators/interpolate_v2_op_mlu.cc b/paddle/fluid/operators/interpolate_v2_op_mlu.cc index 9e39d97f710c5..1383be6f93fb9 100644 --- a/paddle/fluid/operators/interpolate_v2_op_mlu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_mlu.cc @@ -20,11 +20,10 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; using DataLayout = framework::DataLayout; inline std::vector get_new_shape_mlu( - const std::vector& list_new_shape_tensor) { + const std::vector& list_new_shape_tensor) { // get tensor from std::vector vec_new_shape; for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { @@ -33,7 +32,7 @@ inline std::vector get_new_shape_mlu( tensor->dims(), phi::make_ddim({1}), platform::errors::InvalidArgument("shape of dim tensor should be [1]")); - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } @@ -46,8 +45,8 @@ class InterpolateV2MLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); auto input_dims = input->dims(); PADDLE_ENFORCE_GE( @@ -79,7 +78,7 @@ class InterpolateV2MLUKernel : public framework::OpKernel { float scale_h = -1; float scale_w = -1; - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape_mlu(list_new_size_tensor); @@ -94,7 +93,7 @@ class InterpolateV2MLUKernel : public framework::OpKernel { out_w = new_size[2]; } } else { - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { std::vector scale_data; @@ -146,7 +145,7 @@ class InterpolateV2MLUKernel : public framework::OpKernel { if (scale_d > 0.) 
{ out_d = static_cast(in_d * scale_d); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { std::vector out_size_data; out_size_data = GetDataFromTensor(out_size); @@ -359,8 +358,10 @@ class InterpolateV2GradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); auto output_grad_dims = output_grad->dims(); @@ -369,7 +370,7 @@ class InterpolateV2GradMLUKernel : public framework::OpKernel { platform::errors::External( "XPU Interpolategrad kernel only support 2d")); - auto* input = ctx.Input("X"); + auto* input = ctx.Input("X"); auto input_dims = input->dims(); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = @@ -388,14 +389,14 @@ class InterpolateV2GradMLUKernel : public framework::OpKernel { float scale_h = -1; float scale_w = -1; - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape_mlu(list_new_size_tensor); out_h = new_size[0]; out_w = new_size[1]; } else { - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { std::vector scale_data; @@ -428,7 +429,7 @@ class InterpolateV2GradMLUKernel : public framework::OpKernel { out_h = static_cast(in_h * scale_h); out_w = static_cast(in_w * scale_w); } - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { std::vector out_size_data; out_size_data = GetDataFromTensor(out_size); diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc index ea11d3c87a812..cb84e694dade4 100644 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = framework::DataLayout; using DDim = framework::DDim; using fp16 = paddle::platform::float16; @@ -38,54 +38,60 @@ struct InterpolateFunction { FillNpuTensorWithConstant(&t0, static_cast(0)); FillNpuTensorWithConstant(&t1, static_cast(1)); } - void Arange(int n, Tensor* x) { + void Arange(int n, phi::DenseTensor* x) { FillNpuTensorWithConstant(&tn, static_cast(n)); const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); runner.Run(stream); } - void ReduceSum(const Tensor* x, - Tensor* y, + void ReduceSum(const phi::DenseTensor* x, + phi::DenseTensor* y, const std::vector& dim, bool keep_dims = true) { const auto& runner = NpuOpRunner( "ReduceSumD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); runner.Run(stream); } - void Add(const Tensor* x, const Tensor* y, Tensor* z) { + void Add(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Adds(const Tensor* x, float scalar, Tensor* y) { + void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); runner.Run(stream); } - void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + void Mul(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + void Sub(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Cast(const Tensor* x, Tensor* y) { + void Cast(const phi::DenseTensor* x, phi::DenseTensor* y) { auto dst_dtype = ConvertToNpuDtype(framework::TransToProtoVarType(y->dtype())); const auto& runner = NpuOpRunner( "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); runner.Run(stream); } - void Gather(const Tensor* x, - const Tensor* indices, + void Gather(const phi::DenseTensor* x, + const phi::DenseTensor* indices, const int axis, - Tensor* y) { + phi::DenseTensor* y) { const auto& runner = NpuOpRunner("GatherV2D", {*x, *indices}, {*y}, {{"axis", axis}}); runner.Run(stream); } - void GatherGrad(const Tensor* gy, - const Tensor* indices, + void GatherGrad(const phi::DenseTensor* gy, + const phi::DenseTensor* indices, const int axis, - Tensor* gx) { + phi::DenseTensor* gx) { // 1 gy swapaxis: axis & 0 int len = (gy->dims()).size(); std::vector axis_swap(len); @@ -115,32 +121,38 @@ struct InterpolateFunction { // 3 gx swapaxis: axis, 0 Transpose(&gx_t, gx, axis_swap); } - void Scatter(const Tensor* x, - const Tensor* index, - const Tensor* updates, - Tensor* y) { + void Scatter(const phi::DenseTensor* x, + const phi::DenseTensor* index, + const phi::DenseTensor* updates, + phi::DenseTensor* y) { const auto& runner = NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*y}, {}); runner.Run(stream); } - void Transpose(const Tensor* x, Tensor* y, const std::vector& axis) { + void Transpose(const phi::DenseTensor* x, + phi::DenseTensor* y, + const std::vector& axis) { const auto& runner = NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); runner.Run(stream); } - void Muls(const Tensor* x, float scalar, Tensor* y) { + void Muls(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { const auto& 
runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); runner.Run(stream); } - void Maximum(const Tensor* x, const Tensor* y, Tensor* z) { + void Maximum(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Minimum(const Tensor* x, const Tensor* y, Tensor* z) { + void Minimum(const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Floor(const Tensor* x, Tensor* y) { + void Floor(const phi::DenseTensor* x, phi::DenseTensor* y) { const auto& runner = NpuOpRunner("Floor", {*x}, {*y}, {}); runner.Run(stream); } @@ -155,7 +167,7 @@ struct InterpolateFunction { }; template <> -void InterpolateFunction::Arange(int n, Tensor* x) { +void InterpolateFunction::Arange(int n, phi::DenseTensor* x) { Tensor x_fp32(experimental::DataType::FLOAT32); x_fp32.mutable_data(x->dims(), place); FillNpuTensorWithConstant(&tn, static_cast(n)); @@ -216,14 +228,14 @@ void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, bool align_cond, float ratio_h, float ratio_w, - Tensor* h0, - Tensor* h1, - Tensor* w0, - Tensor* w1, - Tensor* coef_h0, - Tensor* coef_h1, - Tensor* coef_w0, - Tensor* coef_w1) { + phi::DenseTensor* h0, + phi::DenseTensor* h1, + phi::DenseTensor* w0, + phi::DenseTensor* w1, + phi::DenseTensor* coef_h0, + phi::DenseTensor* coef_h1, + phi::DenseTensor* coef_w0, + phi::DenseTensor* coef_w1) { InterpolateFunction F(ctx); auto place = ctx.GetPlace(); Tensor _h0, _w0; @@ -291,8 +303,8 @@ void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, template void BilinearFwdNpu(const framework::ExecutionContext& ctx, - const Tensor* input, - Tensor* output, + const phi::DenseTensor* input, + phi::DenseTensor* output, const float scale_h, const float scale_w, const bool align_corners, @@ -382,8 +394,8 @@ void BilinearFwdNpu(const framework::ExecutionContext& ctx, template void BilinearBwdNpu(const framework::ExecutionContext& ctx, - const Tensor* gout, - Tensor* gin, + const phi::DenseTensor* gout, + phi::DenseTensor* gin, const float scale_h, const float scale_w, const bool align_corners, @@ -477,8 +489,8 @@ template class InterpolateV2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); auto input_dims = input->dims(); PADDLE_ENFORCE_EQ(input_dims.size(), @@ -509,8 +521,7 @@ class InterpolateV2NPUKernel : public framework::OpKernel { float scale_w = -1; // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - auto list_new_shape_tensor = - ctx.MultiInput("SizeTensor"); + auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); if (list_new_shape_tensor.size() > 0) { std::vector output_h(1); std::vector output_w(1); @@ -521,12 +532,12 @@ class InterpolateV2NPUKernel : public framework::OpKernel { out_h = output_h[0]; out_w = output_w[0]; } else if (ctx.HasInput("OutSize")) { - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } else { - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) 
{ auto scale_data = @@ -634,9 +645,11 @@ template class InterpolateV2NPUGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* input = ctx.Input("X"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = @@ -661,7 +674,7 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel { float scale_w = -1; // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { std::vector output_h(1); std::vector output_w(1); @@ -672,12 +685,12 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel { out_h = output_h[0]; out_w = output_w[0]; } else if (ctx.HasInput("OutSize")) { - auto out_size = ctx.Input("OutSize"); + auto out_size = ctx.Input("OutSize"); auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } else { - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { auto scale_data = diff --git a/paddle/fluid/operators/ipu/ipu_runtime_op.cc b/paddle/fluid/operators/ipu/ipu_runtime_op.cc index 802cc13ae4e07..e243c8f7d9e36 100644 --- a/paddle/fluid/operators/ipu/ipu_runtime_op.cc +++ b/paddle/fluid/operators/ipu/ipu_runtime_op.cc @@ -35,8 +35,8 @@ class IpuRuntimeOp : public framework::OperatorBase { auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); framework::RuntimeContext runtime_ctx(inputs_, outputs_, scope); framework::ExecutionContext ctx(*this, scope, *dev_ctx, runtime_ctx); - auto inputs = ctx.MultiInput("FeedList"); - auto outputs = ctx.MultiOutput("FetchList"); + auto inputs = ctx.MultiInput("FeedList"); + auto outputs = ctx.MultiOutput("FetchList"); auto output_names = ctx.OutputNames("FetchList"); VLOG(4) << "IpuRuntime Kernel, begin to run graph"; ipu_backend->Run(inputs, outputs, ctx); diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 427d35699867b..88d60bc1c15ad 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -32,14 +32,14 @@ class DenseTensor; namespace paddle { namespace framework { // store the result bool in gpu tensor, async operation. Faster than above ones. 
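The isfinite helpers declared just below come in two flavours once they operate on `phi::DenseTensor`: bool-returning overloads that copy the result back to the host, and tensor-output overloads that leave a one-element flag on the same device as the input. A small usage sketch, where the wrapper function names are illustrative and not part of the header:

```cpp
#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/phi/core/dense_tensor.h"

// Host-side check: each helper copies its single bool result back to the CPU.
bool HasBadValues(const phi::DenseTensor& grad) {
  return paddle::framework::TensorContainsNAN(grad) ||
         paddle::framework::TensorContainsInf(grad);
}

// Device-side check: the result stays in a one-element tensor next to `grad`
// (the helper resizes `flag` to {1} itself), so the stream can stay async.
void IsFiniteAsync(const phi::DenseTensor& grad, phi::DenseTensor* flag) {
  paddle::framework::TensorIsfinite(grad, flag);
}
```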
-void TensorContainsNAN(const framework::Tensor& tensor, framework::Tensor* out); -void TensorContainsInf(const framework::Tensor& tensor, framework::Tensor* out); -void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out); +void TensorContainsNAN(const phi::DenseTensor& tensor, phi::DenseTensor* out); +void TensorContainsInf(const phi::DenseTensor& tensor, phi::DenseTensor* out); +void TensorIsfinite(const phi::DenseTensor& tensor, phi::DenseTensor* out); // copy the result bool to cpu -bool TensorContainsNAN(const framework::Tensor& tensor); -bool TensorContainsInf(const framework::Tensor& tensor); -bool TensorIsfinite(const framework::Tensor& tensor); +bool TensorContainsNAN(const phi::DenseTensor& tensor); +bool TensorContainsInf(const phi::DenseTensor& tensor); +bool TensorIsfinite(const phi::DenseTensor& tensor); #define FiniteVisitor(type, reduce_type, device) \ struct type##Visitor##device { \ @@ -50,7 +50,7 @@ bool TensorIsfinite(const framework::Tensor& tensor); auto place = in_.place(); \ auto* ctx = static_cast( \ platform::DeviceContextPool::Instance().Get(place)); \ - Tensor tmp; \ + phi::DenseTensor tmp; \ tmp.Resize(in_.dims()); \ out_->Resize({1}); \ std::vector dims(tmp.dims().size()); \ @@ -73,8 +73,8 @@ FiniteVisitor(Isfinite, All, GPU); #endif // store the result bool in gpu tensor, async operation. Faster than above ones. -inline void TensorContainsNAN(const framework::Tensor& tensor, - framework::Tensor* out) { +inline void TensorContainsNAN(const phi::DenseTensor& tensor, + phi::DenseTensor* out) { auto place = tensor.place(); if (platform::is_cpu_place(tensor.place())) { VisitDataTypeNormal(TransToProtoVarType(tensor.dtype()), @@ -90,8 +90,8 @@ inline void TensorContainsNAN(const framework::Tensor& tensor, #endif PADDLE_THROW(platform::errors::Unimplemented("Not supported on %s.", place)); } -inline void TensorContainsInf(const framework::Tensor& tensor, - framework::Tensor* out) { +inline void TensorContainsInf(const phi::DenseTensor& tensor, + phi::DenseTensor* out) { auto place = tensor.place(); if (platform::is_cpu_place(tensor.place())) { VisitDataTypeNormal(TransToProtoVarType(tensor.dtype()), @@ -107,8 +107,8 @@ inline void TensorContainsInf(const framework::Tensor& tensor, #endif PADDLE_THROW(platform::errors::Unimplemented("Not supported on %s.", place)); } -inline void TensorIsfinite(const framework::Tensor& tensor, - framework::Tensor* out) { +inline void TensorIsfinite(const phi::DenseTensor& tensor, + phi::DenseTensor* out) { auto place = tensor.place(); if (platform::is_cpu_place(tensor.place())) { VisitDataTypeNormal(TransToProtoVarType(tensor.dtype()), @@ -126,37 +126,37 @@ inline void TensorIsfinite(const framework::Tensor& tensor, } // copy the result bool to cpu -inline bool TensorContainsNAN(const framework::Tensor& tensor) { - Tensor out; +inline bool TensorContainsNAN(const phi::DenseTensor& tensor) { + phi::DenseTensor out; TensorContainsNAN(tensor, &out); return GetValue(&out); } -inline bool TensorContainsInf(const framework::Tensor& tensor) { - Tensor out; +inline bool TensorContainsInf(const phi::DenseTensor& tensor) { + phi::DenseTensor out; TensorContainsInf(tensor, &out); return GetValue(&out); } -inline bool TensorIsfinite(const framework::Tensor& tensor) { - Tensor out; +inline bool TensorIsfinite(const phi::DenseTensor& tensor) { + phi::DenseTensor out; TensorIsfinite(tensor, &out); return GetValue(&out); } } // namespace framework namespace operators { struct InfinityFunctor { - void operator()(const 
framework::Tensor& tensor, framework::Tensor* out) { + void operator()(const phi::DenseTensor& tensor, phi::DenseTensor* out) { framework::TensorContainsInf(tensor, out); } }; struct NANFunctor { - void operator()(const framework::Tensor& tensor, framework::Tensor* out) { + void operator()(const phi::DenseTensor& tensor, phi::DenseTensor* out) { framework::TensorContainsNAN(tensor, out); } }; struct IsfiniteFunctor { - void operator()(const framework::Tensor& tensor, framework::Tensor* out) { + void operator()(const phi::DenseTensor& tensor, phi::DenseTensor* out) { framework::TensorIsfinite(tensor, out); } }; @@ -166,22 +166,22 @@ class OverflowKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { auto* x = ctx.InputVar("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); Functor functor; if (x->IsType()) { - auto* in = ctx.Input("X"); + auto* in = ctx.Input("X"); functor(*in, out); } else if (x->IsType()) { auto& in = ctx.Input("X")->value(); functor(in, out); } else { - PADDLE_ENFORCE_EQ( - true, - false, - platform::errors::InvalidArgument( - "The input type mismatch, the type of Input(X) must be Tensor or " - "SelectedRows, please check your input.")); + PADDLE_ENFORCE_EQ(true, + false, + platform::errors::InvalidArgument( + "The input type mismatch, the type of Input(X) " + "must be phi::DenseTensor or " + "SelectedRows, please check your input.")); } } }; diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 50fd6056d84b0..8070527a56a8c 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -135,7 +135,7 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { LOG(INFO) << loginfos.str(); } -using Tensor = paddle::framework::Tensor; +using Tensor = phi::DenseTensor; template void BenchKernelXYZN() { using T = typename KernelTuple::data_type; diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index decee5567b486..9a06fd369f882 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -19,8 +19,6 @@ namespace paddle { namespace operators { -using framework::Tensor; - class KLDivLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index 38ad17249bec5..f21e939a7b118 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -20,15 +20,15 @@ limitations under the Licnse. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class KLDivLossNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* target = ctx.Input("Target"); - auto* loss = ctx.Output("Loss"); + auto* input = ctx.Input("X"); + auto* target = ctx.Input("Target"); + auto* loss = ctx.Output("Loss"); auto reduction = ctx.Attr("reduction"); loss->mutable_data(ctx.GetPlace()); @@ -103,9 +103,11 @@ template class KLDivLossGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* target = ctx.Input("Target"); - auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* target = ctx.Input("Target"); + auto* loss_grad = + ctx.Input(framework::GradVarName("Loss")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); auto reduction = ctx.Attr("reduction"); input_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index 250a5ae0061ed..707d9a47006f2 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -38,7 +38,7 @@ class KronOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input @@ -120,7 +120,7 @@ class KronGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 093a33d89b03f..112a84b00e329 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; - class L1NormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/l1_norm_op.h b/paddle/fluid/operators/l1_norm_op.h index 5629ea60dbc40..36465c14bf00a 100644 --- a/paddle/fluid/operators/l1_norm_op.h +++ b/paddle/fluid/operators/l1_norm_op.h @@ -25,8 +25,8 @@ template class L1NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - framework::Tensor *Out = context.Output("Out"); + const phi::DenseTensor *X = context.Input("X"); + phi::DenseTensor *Out = context.Output("Out"); Out->mutable_data(context.GetPlace()); auto x = framework::EigenVector::Flatten(*X); @@ -43,16 +43,16 @@ template class L1NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *x = context.Input("X"); - const framework::Tensor *d_out = - context.Input(framework::GradVarName("Out")); + const phi::DenseTensor *x = context.Input("X"); + const phi::DenseTensor *d_out = + context.Input(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ( d_out->numel(), 1, platform::errors::InvalidArgument( "Input(GRAD@Out) of L1NormGradOP should be a scalar.")); - framework::Tensor *dx = - context.Output(framework::GradVarName("X")); + phi::DenseTensor *dx = + context.Output(framework::GradVarName("X")); dx->mutable_data(context.GetPlace()); auto x_eigen = framework::EigenVector::Flatten(*x); diff --git a/paddle/fluid/operators/label_smooth_op_mlu.cc b/paddle/fluid/operators/label_smooth_op_mlu.cc index 8a91dc1f4c75a..d667db483a9b6 100644 --- a/paddle/fluid/operators/label_smooth_op_mlu.cc +++ b/paddle/fluid/operators/label_smooth_op_mlu.cc @@ -18,7 +18,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -26,7 +26,7 @@ class LabelSmoothMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); + auto* dist_t = ctx.Input("PriorDist"); auto* out_t = ctx.Output("Out"); auto epsilon = ctx.Attr("epsilon"); auto epsilon_gt = 1.0f - epsilon; @@ -55,7 +55,7 @@ class LabelSmoothMLUKernel : public framework::OpKernel { epsilon); } else { auto& dev_ctx = ctx.template device_context(); - framework::Tensor dist_tensor = + phi::DenseTensor dist_tensor = ctx.AllocateTmpTensor({1, label_dim}, dev_ctx); MLUCnnlTensorDesc dist_desc(dist_tensor); auto value = static_cast(1.0f / label_dim); diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc index 7289770fc60ed..d899dbf99c525 100644 --- a/paddle/fluid/operators/label_smooth_op_npu.cc +++ b/paddle/fluid/operators/label_smooth_op_npu.cc @@ -18,15 +18,15 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template void LabelSmoothMuls(const platform::Place& place, const aclrtStream& stream, - const Tensor* in, + const phi::DenseTensor* in, float val, - Tensor* out) { + phi::DenseTensor* out) { out->mutable_data(in->dims(), place); const auto& runner = NpuOpRunner("Muls", {*in}, {*out}, {{"value", val}}); runner.Run(stream); @@ -35,9 +35,9 @@ 
void LabelSmoothMuls(const platform::Place& place, template void LabelSmoothAdds(const platform::Place& place, const aclrtStream& stream, - const Tensor* in, + const phi::DenseTensor* in, float val, - Tensor* out) { + phi::DenseTensor* out) { out->mutable_data(in->dims(), place); const auto& runner = NpuOpRunner("Adds", {*in}, {*out}, {{"value", val}}); runner.Run(stream); @@ -46,9 +46,9 @@ void LabelSmoothAdds(const platform::Place& place, template void LabelSmoothAddBroadCast(const platform::Place& place, const aclrtStream& stream, - const Tensor* in1, - const Tensor* in2, - Tensor* out) { + const phi::DenseTensor* in1, + const phi::DenseTensor* in2, + phi::DenseTensor* out) { out->mutable_data(place); const auto& runner = NpuOpRunner("AddV2", {*in1, *in2}, {*out}, {}); runner.Run(stream); @@ -60,7 +60,7 @@ class LabelSmoothNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* out_t = ctx.Output("Out"); auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); + auto* dist_t = ctx.Input("PriorDist"); auto epsilon = ctx.Attr("epsilon"); auto label_dim = in_t->dims()[in_t->dims().size() - 1]; @@ -90,8 +90,8 @@ template class LabelSmoothGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* d_in_t = ctx.Output(framework::GradVarName("X")); auto epsilon = ctx.Attr("epsilon"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 899eae3efb45b..d9fa06b7e52f5 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -33,7 +33,7 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; template @@ -937,12 +937,12 @@ void ln_bwd_fast_kernel_driver(const phi::GPUContext &dev_ctx, const int gridx = 2 * dev_ctx.GetSMCount(); // get temp space for dscale and dbias. - framework::Tensor dscale_temp; + phi::DenseTensor dscale_temp; dscale_temp.Resize({gridx, cols}); dscale_temp.mutable_data(dev_ctx.GetPlace()); U *dscale_temp_ptr = dscale_temp.data(); - framework::Tensor dbias_temp; + phi::DenseTensor dbias_temp; dbias_temp.Resize({gridx, cols}); dbias_temp.mutable_data(dev_ctx.GetPlace()); U *dbias_temp_ptr = dbias_temp.data(); diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 13cd443473040..096125ff2f9d3 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -24,7 +24,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; @@ -114,7 +114,8 @@ class LayerNormOp : public framework::OperatorWithKernel { #ifdef PADDLE_WITH_MKLDNN int begin_norm_axis = ctx.Attr("begin_norm_axis"); if (this->CanMKLDNNBeUsed(ctx, input_data_type) && - begin_norm_axis == ctx.Input("X")->dims().size() - 1) { + begin_norm_axis == + ctx.Input("X")->dims().size() - 1) { return framework::OpKernelType(input_data_type, ctx.GetPlace(), framework::DataLayout::kMKLDNN, diff --git a/paddle/fluid/operators/layer_norm_op_mlu.cc b/paddle/fluid/operators/layer_norm_op_mlu.cc index 5819cc3b40801..7058f9f094923 100644 --- a/paddle/fluid/operators/layer_norm_op_mlu.cc +++ b/paddle/fluid/operators/layer_norm_op_mlu.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DDim = framework::DDim; template @@ -28,12 +28,12 @@ class LayerNormMLUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* variance = ctx.Output("Variance"); + const auto* x = ctx.Input("X"); + const auto* scale = ctx.Input("Scale"); + const auto* bias = ctx.Input("Bias"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* variance = ctx.Output("Variance"); auto place = ctx.GetPlace(); @@ -151,14 +151,15 @@ class LayerNormGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const auto* x = ctx.Input("X"); - const auto* mean = ctx.Input("Mean"); - const auto* variance = ctx.Input("Variance"); - const auto* scale = ctx.Input("Scale"); - const auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dscale = ctx.Output(framework::GradVarName("Scale")); - auto* dbias = ctx.Output(framework::GradVarName("Bias")); + const auto* x = ctx.Input("X"); + const auto* mean = ctx.Input("Mean"); + const auto* variance = ctx.Input("Variance"); + const auto* scale = ctx.Input("Scale"); + const auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = + ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); auto place = ctx.GetPlace(); dx->mutable_data(place); diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index 5aed9c76a86ff..146d441fed318 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -18,7 +18,7 @@ limitations under the License. 
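// A sketch (assumed example, not from the patch) of the gradient-kernel side
// of the same rename, as used by the MLU layer_norm grad kernel above and the
// NPU kldiv grad earlier: framework::GradVarName() still builds the "@GRAD"
// variable names, only the accessor template argument changes. The
// MyOpGradKernel class and its Out@GRAD / X@GRAD slots are hypothetical.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/dense_tensor.h"

template <typename T>
class MyOpGradKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    namespace fw = paddle::framework;
    const phi::DenseTensor* dout =
        ctx.Input<phi::DenseTensor>(fw::GradVarName("Out"));
    phi::DenseTensor* dx = ctx.Output<phi::DenseTensor>(fw::GradVarName("X"));
    dx->Resize(dout->dims());
    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
    const T* dout_data = dout->data<T>();
    for (int64_t i = 0; i < dout->numel(); ++i) {
      dx_data[i] = dout_data[i];  // pass-through grad, stand-in for real math
    }
  }
};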
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DDim = framework::DDim; using DataLayout = framework::DataLayout; @@ -53,12 +53,12 @@ class LayerNormNPUKernel : public framework::OpKernel { using U = LayerNormParamType; const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* variance = ctx.Output("Variance"); + const auto* x = ctx.Input("X"); + const auto* scale = ctx.Input("Scale"); + const auto* bias = ctx.Input("Bias"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* variance = ctx.Output("Variance"); const auto& x_dims = x->dims(); std::vector axes; auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); @@ -86,7 +86,7 @@ class LayerNormNPUKernel : public framework::OpKernel { runner.Run(stream); scale = &default_scale; } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); + const_cast(scale)->Resize(phi::make_ddim(axes)); } Tensor default_bias(x->type()); @@ -100,7 +100,7 @@ class LayerNormNPUKernel : public framework::OpKernel { runner.Run(stream); bias = &default_bias; } else { - const_cast(bias)->Resize(phi::make_ddim(axes)); + const_cast(bias)->Resize(phi::make_ddim(axes)); } // cast scale from LayerNormParamType to T if needed @@ -146,7 +146,7 @@ class LayerNormNPUKernel : public framework::OpKernel { y->mutable_data(ctx.GetPlace()); // mean should be of U type - Tensor* tmp_mean = mean; + phi::DenseTensor* tmp_mean = mean; Tensor cast_mean(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && @@ -163,7 +163,7 @@ class LayerNormNPUKernel : public framework::OpKernel { } // same for variance - Tensor* tmp_variance = variance; + phi::DenseTensor* tmp_variance = variance; Tensor cast_variance(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && @@ -219,8 +219,8 @@ class LayerNormNPUKernel : public framework::OpKernel { // revert shape of scale and bias // TODO(zhiqiu): better implementation, use tmp tensor to avoid write input // tensor. 
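// The layer_norm NPU kernels in this hunk reshape their const scale/bias
// inputs in place (const_cast + Resize) and put the original shape back
// afterwards, which is what the TODO above is complaining about. A small
// sketch of that pattern as a helper; wrapping it in a callable and the
// helper name are editorial assumptions, the Resize calls mirror the hunk.
#include <functional>
#include <vector>

#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"

inline void RunWithTemporaryShape(const phi::DenseTensor* t,
                                  const std::vector<int64_t>& tmp_shape,
                                  const std::function<void()>& body) {
  const phi::DDim saved_dims = t->dims();
  // NB: mutates a const input, exactly like the kernel above does.
  const_cast<phi::DenseTensor*>(t)->Resize(phi::make_ddim(tmp_shape));
  body();  // run the NPU op while the tensor carries the broadcast shape
  const_cast<phi::DenseTensor*>(t)->Resize(saved_dims);
}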
- const_cast(scale)->Resize(phi::make_ddim({right})); - const_cast(bias)->Resize(phi::make_ddim({right})); + const_cast(scale)->Resize(phi::make_ddim({right})); + const_cast(bias)->Resize(phi::make_ddim({right})); } }; @@ -230,15 +230,16 @@ class LayerNormGradNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { using U = LayerNormParamType; const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const auto* x = ctx.Input("X"); + const auto* x = ctx.Input("X"); const auto& x_dims = x->dims(); - const auto* mean = ctx.Input("Mean"); - const auto* variance = ctx.Input("Variance"); - const auto* scale = ctx.Input("Scale"); - const auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dscale = ctx.Output(framework::GradVarName("Scale")); - auto* dbias = ctx.Output(framework::GradVarName("Bias")); + const auto* mean = ctx.Input("Mean"); + const auto* variance = ctx.Input("Variance"); + const auto* scale = ctx.Input("Scale"); + const auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = + ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); int right = static_cast(matrix_dim[1]); @@ -268,8 +269,9 @@ class LayerNormGradNPUKernel : public framework::OpKernel { } auto mean_dims = mean->dims(); - const_cast(mean)->Resize(phi::make_ddim({new_shape})); - const_cast(variance)->Resize(phi::make_ddim({new_shape})); + const_cast(mean)->Resize(phi::make_ddim({new_shape})); + const_cast(variance)->Resize( + phi::make_ddim({new_shape})); Tensor default_scale(x->type()); if (!scale) { @@ -282,7 +284,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { runner.Run(stream); scale = &default_scale; } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); + const_cast(scale)->Resize(phi::make_ddim(axes)); } // cast scale from LayerNormParamType to T if needed @@ -358,7 +360,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { dbias->Resize(phi::make_ddim(axes)); // dscale should be of U type - Tensor* tmp_dscale = dscale; + phi::DenseTensor* tmp_dscale = dscale; Tensor cast_dscale(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && @@ -375,7 +377,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { } // same for dbias - Tensor* tmp_dbias = dbias; + phi::DenseTensor* tmp_dbias = dbias; Tensor cast_dbias(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && @@ -427,9 +429,9 @@ class LayerNormGradNPUKernel : public framework::OpKernel { runner_cast_dbias.Run(stream); } - const_cast(mean)->Resize(mean_dims); - const_cast(variance)->Resize(mean_dims); - const_cast(scale)->Resize(phi::make_ddim({right})); + const_cast(mean)->Resize(mean_dims); + const_cast(variance)->Resize(mean_dims); + const_cast(scale)->Resize(phi::make_ddim({right})); dscale->Resize(phi::make_ddim({right})); dbias->Resize(phi::make_ddim({right})); } diff --git a/paddle/fluid/operators/layout_utils.h b/paddle/fluid/operators/layout_utils.h index c15786fc83410..d475eab967d78 100644 --- a/paddle/fluid/operators/layout_utils.h +++ b/paddle/fluid/operators/layout_utils.h @@ -26,12 +26,12 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template inline void 
ResizeToChannelFirst(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* transformed_input) { + const phi::DenseTensor* input, + phi::DenseTensor* transformed_input) { int dim = input->dims().size() - 2; if (dim == 3) { // input @@ -68,8 +68,8 @@ inline void ResizeToChannelFirst(const framework::ExecutionContext& context, template inline void ResizeToChannelLast(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* transformed_input) { + const phi::DenseTensor* input, + phi::DenseTensor* transformed_input) { int dim = input->dims().size() - 2; if (dim == 3) { // input @@ -106,8 +106,8 @@ inline void ResizeToChannelLast(const framework::ExecutionContext& context, template inline void TransToChannelFirst(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* transformed_input) { + const phi::DenseTensor* input, + phi::DenseTensor* transformed_input) { VLOG(5) << "Why am I called?"; int dim = input->dims().size() - 2; if (dim == 3) { @@ -131,8 +131,8 @@ inline void TransToChannelFirst(const framework::ExecutionContext& context, template inline void TransToChannelLast(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* transformed_input) { + const phi::DenseTensor* input, + phi::DenseTensor* transformed_input) { int dim = input->dims().size() - 2; if (dim == 3) { auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index d14cc0762617e..079d6cd0bfcc6 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -29,7 +29,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template __global__ void limit_by_capacity_impl( @@ -54,10 +54,10 @@ template class LimitByCapacityOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto expert_count = context.Input("expert_count"); - auto capacity = context.Input("capacity"); + auto expert_count = context.Input("expert_count"); + auto capacity = context.Input("capacity"); auto n_worker = context.Attr("n_worker"); - auto out = context.Output("Out"); + auto out = context.Output("Out"); auto n_expert = expert_count->numel() / n_worker; const auto place = context.GetPlace(); @@ -68,7 +68,7 @@ class LimitByCapacityOpCUDAKernel : public framework::OpKernel { auto out_data = out->mutable_data(place); const T* ec_data = expert_count->data(); - framework::Tensor capacity_copy; + phi::DenseTensor capacity_copy; framework::TensorCopy(*capacity, place, dev_ctx, &capacity_copy); T* cap_data = capacity_copy.mutable_data(place); diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index de6daf33f8426..bd48acc3796b1 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -48,20 +48,22 @@ struct ScalarMul { using framework::LoD; using framework::LoDTensor; -using framework::Tensor; template class LinearChainCRFOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* emission_weights = ctx.Input("Emission"); - const Tensor* transition_weights = - ctx.Input("Transition"); - - Tensor* emission_exps = ctx.Output("EmissionExps"); - Tensor* transition_exps = 
ctx.Output("TransitionExps"); - Tensor* alpha = ctx.Output("Alpha"); - Tensor* ll = ctx.Output("LogLikelihood"); + const phi::DenseTensor* emission_weights = + ctx.Input("Emission"); + const phi::DenseTensor* transition_weights = + ctx.Input("Transition"); + + phi::DenseTensor* emission_exps = + ctx.Output("EmissionExps"); + phi::DenseTensor* transition_exps = + ctx.Output("TransitionExps"); + phi::DenseTensor* alpha = ctx.Output("Alpha"); + phi::DenseTensor* ll = ctx.Output("LogLikelihood"); // Because the computation codes only runs on CPU, here the memory for all // the outputs is FIXED to be allocated on the CPU memory. @@ -70,18 +72,19 @@ class LinearChainCRFOpKernel : public framework::OpKernel { transition_exps->mutable_data(platform::CPUPlace()); auto emission_dims = emission_weights->dims(); - const Tensor* label = ctx.Input("Label"); - Tensor emission_weights_tmp = *emission_weights; - Tensor label_tmp = *label; - Tensor emission_exps_tmp = *emission_exps; - Tensor alpha_tmp = *alpha; + const phi::DenseTensor* label = ctx.Input("Label"); + phi::DenseTensor emission_weights_tmp = *emission_weights; + phi::DenseTensor label_tmp = *label; + phi::DenseTensor emission_exps_tmp = *emission_exps; + phi::DenseTensor alpha_tmp = *alpha; int64_t seq_num = 0; int64_t batch_size; int64_t tag_num; const int64_t* length_data = nullptr; framework::LoD in_lod; if (ctx.HasInput("Length")) { - const Tensor* label_length = ctx.Input("Length"); + const phi::DenseTensor* label_length = + ctx.Input("Length"); length_data = label_length->data(); seq_num = label_length->numel(); PADDLE_ENFORCE_EQ( @@ -125,7 +128,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { ll->Resize({seq_num, 1}); ll->mutable_data(platform::CPUPlace()); // Now, all the inputs and outputs should be on the CPU memory. 
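// The linear_chain_crf kernels around this hunk allocate CPU scratch buffers
// as phi::DenseTensor and then carve per-sequence views out of them with
// Slice(). A condensed sketch of those two moves; the float dtype and the
// [batch, 1] shape are illustrative only.
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"

inline phi::DenseTensor MakeRowMaxBuffer(int64_t batch_size) {
  phi::DenseTensor row_max;
  row_max.Resize(phi::make_ddim({batch_size, 1}));
  row_max.mutable_data<float>(paddle::platform::CPUPlace());
  return row_max;
}

// Slice() returns a view over rows [begin, end); it shares the allocation,
// so writing through the view updates the original buffer.
inline phi::DenseTensor RowsOf(const phi::DenseTensor& buf,
                               int64_t begin,
                               int64_t end) {
  return buf.Slice(begin, end);
}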
- Tensor emission_row_max; + phi::DenseTensor emission_row_max; emission_row_max.mutable_data( phi::make_ddim({static_cast(batch_size), 1}), platform::CPUPlace()); @@ -158,11 +161,15 @@ class LinearChainCRFOpKernel : public framework::OpKernel { log_likelihood[i] = 0.; continue; } - const Tensor one_seq = emission_weights_tmp.Slice(start_pos, end_pos); - Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos); - Tensor one_seq_exps = emission_exps_tmp.Slice(start_pos, end_pos); - const Tensor one_seq_label = label_tmp.Slice(start_pos, end_pos); - Tensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos); + const phi::DenseTensor one_seq = + emission_weights_tmp.Slice(start_pos, end_pos); + phi::DenseTensor one_seq_row_max = + emission_row_max.Slice(start_pos, end_pos); + phi::DenseTensor one_seq_exps = + emission_exps_tmp.Slice(start_pos, end_pos); + const phi::DenseTensor one_seq_label = + label_tmp.Slice(start_pos, end_pos); + phi::DenseTensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos); log_likelihood[i] = ForwardOneSequence(one_seq, one_seq_row_max, one_seq_exps, @@ -174,13 +181,13 @@ class LinearChainCRFOpKernel : public framework::OpKernel { }; private: - T ForwardOneSequence(const Tensor& emission, - const Tensor& emission_row_max, - const Tensor& emission_exps, - const Tensor& trans_weights, - const Tensor& trans_weight_exps, - const Tensor& label, - Tensor* alpha) const { + T ForwardOneSequence(const phi::DenseTensor& emission, + const phi::DenseTensor& emission_row_max, + const phi::DenseTensor& emission_exps, + const phi::DenseTensor& trans_weights, + const phi::DenseTensor& trans_weight_exps, + const phi::DenseTensor& label, + phi::DenseTensor* alpha) const { const T* x = emission.data(); const T* x_row_max = emission_row_max.data(); const T* x_exps = emission_exps.data(); @@ -243,27 +250,31 @@ template class LinearChainCRFGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* label = ctx.Input("Label"); - const Tensor* emission_exps = ctx.Input("EmissionExps"); - const Tensor* transition_exps = ctx.Input("TransitionExps"); - const Tensor* alpha = ctx.Input("Alpha"); + const phi::DenseTensor* label = ctx.Input("Label"); + const phi::DenseTensor* emission_exps = + ctx.Input("EmissionExps"); + const phi::DenseTensor* transition_exps = + ctx.Input("TransitionExps"); + const phi::DenseTensor* alpha = ctx.Input("Alpha"); const T* ll_grad = - ctx.Input(framework::GradVarName("LogLikelihood"))->data(); - Tensor* emission_grad = - ctx.Output(framework::GradVarName("Emission")); + ctx.Input(framework::GradVarName("LogLikelihood")) + ->data(); + phi::DenseTensor* emission_grad = + ctx.Output(framework::GradVarName("Emission")); auto* emission_grad_data = emission_grad->mutable_data(platform::CPUPlace()); memset(emission_grad_data, 0, emission_grad->numel() * sizeof(T)); - Tensor alpha_tmp = *alpha; - Tensor label_tmp = *label; - Tensor emission_exps_tmp = *emission_exps; - Tensor emission_grad_tmp = *emission_grad; + phi::DenseTensor alpha_tmp = *alpha; + phi::DenseTensor label_tmp = *label; + phi::DenseTensor emission_exps_tmp = *emission_exps; + phi::DenseTensor emission_grad_tmp = *emission_grad; // getting seq_num using padding or not int64_t seq_num = 0; framework::LoD in_lod; const int64_t* length_data = nullptr; if (ctx.HasInput("Length")) { - const Tensor* label_length = ctx.Input("Length"); + const phi::DenseTensor* label_length = + ctx.Input("Length"); length_data = 
label_length->data(); seq_num = label_length->numel(); auto emission_dims = emission_grad->dims(); @@ -283,8 +294,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { seq_num = static_cast(in_lod[0].size() - 1); } - Tensor* transition_grad = - ctx.Output(framework::GradVarName("Transition")); + phi::DenseTensor* transition_grad = + ctx.Output(framework::GradVarName("Transition")); // TODO(caoying) Fix this constraint. When the Input(Emission) is from the // data reader operator, it can have no gradients. @@ -298,7 +309,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { // backwark vectors. For a backward vector i (the i-th row of beta), it // captures the unnormalized probabilities of partial sequences starting // at position i. - Tensor beta; + phi::DenseTensor beta; beta.mutable_data(emission_dims, platform::CPUPlace()); if (ctx.HasInput("Length")) { beta.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]}); @@ -318,12 +329,14 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { if (end_pos == start_pos) { continue; } - const Tensor one_seq_emission_exps = + const phi::DenseTensor one_seq_emission_exps = emission_exps_tmp.Slice(start_pos, end_pos); - const Tensor one_seq_label = label_tmp.Slice(start_pos, end_pos); - const Tensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos); - Tensor one_seq_beta = beta.Slice(start_pos, end_pos); - Tensor one_seq_emission_grad = + const phi::DenseTensor one_seq_label = + label_tmp.Slice(start_pos, end_pos); + const phi::DenseTensor one_seq_alpha = + alpha_tmp.Slice(start_pos, end_pos); + phi::DenseTensor one_seq_beta = beta.Slice(start_pos, end_pos); + phi::DenseTensor one_seq_emission_grad = emission_grad_tmp.Slice(start_pos, end_pos); BackwardOneSequence(ctx.template device_context(), ll_grad[i], @@ -340,13 +353,13 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { private: void BackwardOneSequence(const phi::CPUContext& ctx, const T ll_grad, - const Tensor& emission_exps, - const Tensor& transition_exps, - const Tensor& alpha, - const Tensor& label, - Tensor* beta, - Tensor* transition_grad, - Tensor* emission_grad) const { + const phi::DenseTensor& emission_exps, + const phi::DenseTensor& transition_exps, + const phi::DenseTensor& alpha, + const phi::DenseTensor& label, + phi::DenseTensor* beta, + phi::DenseTensor* transition_grad, + phi::DenseTensor* emission_grad) const { const T* w_exps = transition_exps.data(); const T* x_exps = emission_exps.data(); const int64_t* label_value = label.data(); @@ -406,7 +419,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { // TODO(caoying): Fix this to avoid using this local variable if we can // profile the training process. 
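// kron (earlier) and linspace / lod_reset (just below) all override
// GetKernelTypeForVar, and the only change the patch makes there is the type
// of the inspected tensor parameter. A sketch of the override with the new
// signature, for a hypothetical op that keeps the expected dtype and place:
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/dense_tensor.h"

class MyOp : public paddle::framework::OperatorWithKernel {
 public:
  using paddle::framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  paddle::framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name,
      const phi::DenseTensor& tensor,
      const paddle::framework::OpKernelType& expected_kernel_type)
      const override {
    // Keep dtype/place from the expected kernel type; take the layout from
    // the actual tensor, so no transform is forced for this variable.
    return paddle::framework::OpKernelType(expected_kernel_type.data_type_,
                                           expected_kernel_type.place_,
                                           tensor.layout());
  }
};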
- Tensor tmp; + phi::DenseTensor tmp; tmp.mutable_data(beta->dims(), platform::CPUPlace()); auto tmp_mat = framework::EigenMatrix::From(tmp); auto prob = beta_mat * x_exps_mat; diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index 6766c9559be16..d9dcfbed5967f 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -37,7 +37,7 @@ class LinspaceOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (platform::is_xpu_place(tensor.place())) { return framework::OpKernelType( diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index b0a6b073b4a02..1e03bb806f192 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -71,7 +71,7 @@ class LoDResetOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index 1d687232974ef..5049653b1fb50 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -53,7 +53,7 @@ class LoDResetKernel : public framework::OpKernel { return; // early return, since lod already set } else { auto* lod = lod_t->data(); - framework::Tensor lod_cpu; + phi::DenseTensor lod_cpu; if (platform::is_gpu_place(lod_t->place())) { framework::TensorCopySync(*lod_t, platform::CPUPlace(), &lod_cpu); lod = lod_cpu.data(); @@ -115,8 +115,8 @@ template class LoDResetGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); framework::TensorCopy(*d_out, d_out->place(), d_x); } diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index ab4d95c592fc1..ee7be39580b49 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -46,14 +46,14 @@ struct LoDTensorToArrayFunctorImpl { struct LoDTensorToArrayFunctor : public std::unary_function { - std::vector ref_inputs_; - mutable std::vector outputs_; - const framework::Tensor &input_; + std::vector ref_inputs_; + mutable std::vector outputs_; + const phi::DenseTensor &input_; - explicit LoDTensorToArrayFunctor(const framework::Tensor &input) + explicit LoDTensorToArrayFunctor(const phi::DenseTensor &input) : input_(input) {} - void AddOutput(framework::Tensor *t) { + void AddOutput(phi::DenseTensor *t) { outputs_.emplace_back(t); ref_inputs_.emplace_back(t); } @@ -152,7 +152,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase { } } - std::map outputs; + std::map outputs; for (size_t i = 0; i < max_seq_len; ++i) { auto &ranges = copy_ranges[i]; diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index 
465992588cfd3..47c6bef196be1 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -20,14 +20,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template void LogLossAdds(const platform::Place& place, const aclrtStream& stream, - const Tensor* x, + const phi::DenseTensor* x, float scale, - Tensor* y) { + phi::DenseTensor* y) { // Calculate y = x + scale y->mutable_data(x->dims(), place); const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scale}}); @@ -37,9 +37,9 @@ void LogLossAdds(const platform::Place& place, template void LogLossMuls(const platform::Place& place, const aclrtStream& stream, - const Tensor* x, + const phi::DenseTensor* x, float scale, - Tensor* y) { + phi::DenseTensor* y) { // Calculate y = x + scale y->mutable_data(x->dims(), place); const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scale}}); @@ -49,9 +49,9 @@ void LogLossMuls(const platform::Place& place, template void LogLossBCE(const platform::Place& place, const aclrtStream& stream, - const Tensor* x, - const Tensor* y, - Tensor* z) { + const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* z) { z->mutable_data(x->dims(), place); const auto& runner = NpuOpRunner("BinaryCrossEntropy", @@ -64,10 +64,10 @@ void LogLossBCE(const platform::Place& place, template void LogLossBCEGrad(const platform::Place& place, const aclrtStream& stream, - const Tensor* x, - const Tensor* y, - const Tensor* dout, - Tensor* dx) { + const phi::DenseTensor* x, + const phi::DenseTensor* y, + const phi::DenseTensor* dout, + phi::DenseTensor* dx) { dx->mutable_data(x->dims(), place); const auto& runner = NpuOpRunner("BinaryCrossEntropyGrad", @@ -81,9 +81,9 @@ template class LogLossNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* y = ctx.Output("Loss"); - auto* pred = ctx.Input("Predicted"); - auto* label = ctx.Input("Labels"); + auto* y = ctx.Output("Loss"); + auto* pred = ctx.Input("Predicted"); + auto* label = ctx.Input("Labels"); auto epsilon = static_cast(ctx.Attr("epsilon")); auto place = ctx.GetPlace(); @@ -104,10 +104,11 @@ template class LogLossGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* pred = ctx.Input("Predicted"); - auto* label = ctx.Input("Labels"); - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = ctx.Output(framework::GradVarName("Predicted")); + auto* pred = ctx.Input("Predicted"); + auto* label = ctx.Input("Labels"); + auto* dloss = ctx.Input(framework::GradVarName("Loss")); + auto* dpred = + ctx.Output(framework::GradVarName("Predicted")); auto epsilon = static_cast(ctx.Attr("epsilon")); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index 62095dc3524fe..59e0c15678247 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -17,15 +17,15 @@ limitations under the License. 
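// The log_loss NPU helpers above all follow one shape: a free function that
// takes const phi::DenseTensor* inputs, allocates the output on the target
// place, and dispatches a single NPU operator through NpuOpRunner. A sketch
// of that shape for an "Adds"-style scalar op; the include path and the
// helper name are assumptions, the runner usage mirrors the hunks.
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/core/dense_tensor.h"

template <typename T>
void ScalarAdd(const paddle::platform::Place& place,
               const aclrtStream& stream,
               const phi::DenseTensor* x,
               float value,
               phi::DenseTensor* y) {
  // y = x + value, element-wise
  y->mutable_data<T>(x->dims(), place);
  const auto& runner =
      paddle::operators::NpuOpRunner("Adds", {*x}, {*y}, {{"value", value}});
  runner.Run(stream);
}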
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class LogLossXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* predict = ctx.Input("Predicted"); - auto* labels = ctx.Input("Labels"); - auto* loss = ctx.Output("Loss"); + auto* predict = ctx.Input("Predicted"); + auto* labels = ctx.Input("Labels"); + auto* loss = ctx.Output("Loss"); auto epsilon = static_cast(ctx.Attr("epsilon")); loss->mutable_data(ctx.GetPlace()); int n = predict->numel(); @@ -43,10 +43,11 @@ template class LogLossGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* predict = ctx.Input("Predicted"); - auto* labels = ctx.Input("Labels"); - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = ctx.Output(framework::GradVarName("Predicted")); + auto* predict = ctx.Input("Predicted"); + auto* labels = ctx.Input("Labels"); + auto* dloss = ctx.Input(framework::GradVarName("Loss")); + auto* dpred = + ctx.Output(framework::GradVarName("Predicted")); if (!dpred) { return; } diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc b/paddle/fluid/operators/log_softmax_op_npu.cc index acdc4db14bed5..b86786b8a3170 100644 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ b/paddle/fluid/operators/log_softmax_op_npu.cc @@ -25,8 +25,8 @@ template class LogSoftmaxNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); + auto* X = ctx.Input("X"); + auto* Out = ctx.Output("Out"); const int rank = X->dims().size(); const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); Out->mutable_data(ctx.GetPlace()); @@ -44,9 +44,9 @@ template class LogSoftmaxGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* Out = ctx.Input("Out"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); + auto* Out = ctx.Input("Out"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); const int rank = dOut->dims().size(); const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); diff --git a/paddle/fluid/operators/lookup_table_dequant_op.h b/paddle/fluid/operators/lookup_table_dequant_op.h index 8a2dceacb2877..e002a031a795a 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.h +++ b/paddle/fluid/operators/lookup_table_dequant_op.h @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 31a3e40f12e82..c8964647ce372 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 7b4ed84fc209b..41be6b34e6e5b 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -78,13 +78,13 @@ __global__ void LookupTableV2Grad(T *table, template struct LookupTableV2CUDAFunctor { LookupTableV2CUDAFunctor(const framework::ExecutionContext &context, - const framework::Tensor *ids_t) + const phi::DenseTensor *ids_t) : context_(context), ids_t_(ids_t) {} template void apply() { - auto *table_t = context_.Input("W"); - auto *output_t = context_.Output("Out"); + auto *table_t = context_.Input("W"); + auto *output_t = context_.Output("Out"); int64_t padding_idx = context_.Attr("padding_idx"); size_t N = table_t->dims()[0]; @@ -111,14 +111,14 @@ struct LookupTableV2CUDAFunctor { private: const framework::ExecutionContext &context_; - const framework::Tensor *ids_t_; + const phi::DenseTensor *ids_t_; }; template class LookupTableV2CUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const auto *ids_t = context.Input("Ids"); + const auto *ids_t = context.Input("Ids"); LookupTableV2CUDAFunctor functor(context, ids_t); framework::VisitIntDataType(framework::TransToProtoVarType(ids_t->dtype()), functor); @@ -137,7 +137,7 @@ __global__ void InputTypeConvert(const InT *in_ids, template struct LookupTableV2GradCUDAFunctor { LookupTableV2GradCUDAFunctor(const framework::ExecutionContext &context, - const framework::Tensor *ids_t) + const phi::DenseTensor *ids_t) : context_(context), ids_t_(ids_t) {} template @@ -148,9 +148,9 @@ struct LookupTableV2GradCUDAFunctor { // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. 
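// lookup_table_v2 above wraps its kernel body in a small functor so the same
// code can be instantiated for every integer id type and dispatched via
// framework::VisitIntDataType(TransToProtoVarType(ids->dtype()), functor),
// as the hunks show. A condensed sketch of that structure; the functor name
// and the elided gather loop are illustrative.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/dense_tensor.h"

template <typename T>
struct EmbeddingLookupFunctor {
  EmbeddingLookupFunctor(const paddle::framework::ExecutionContext& ctx,
                         const phi::DenseTensor* ids)
      : ctx_(ctx), ids_(ids) {}

  // Instantiated once per integer index type (int32_t / int64_t).
  template <typename IdT>
  void apply() {
    const phi::DenseTensor* table = ctx_.Input<phi::DenseTensor>("W");
    phi::DenseTensor* out = ctx_.Output<phi::DenseTensor>("Out");
    const IdT* id_data = ids_->data<IdT>();
    out->mutable_data<T>(ctx_.GetPlace());
    // ... gather rows of `table` indexed by id_data into `out` ...
    (void)table;
    (void)id_data;
  }

 private:
  const paddle::framework::ExecutionContext& ctx_;
  const phi::DenseTensor* ids_;
};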
if (is_sparse) { - auto *table = context_.Input("W"); + auto *table = context_.Input("W"); auto *d_output = - context_.Input(framework::GradVarName("Out")); + context_.Input(framework::GradVarName("Out")); auto *d_table = context_.Output(framework::GradVarName("W")); @@ -206,9 +206,9 @@ struct LookupTableV2GradCUDAFunctor { } else { auto d_output_t = - context_.Input(framework::GradVarName("Out")); + context_.Input(framework::GradVarName("Out")); auto d_table_t = - context_.Output(framework::GradVarName("W")); + context_.Output(framework::GradVarName("W")); int N = d_table_t->dims()[0]; int D = d_table_t->dims()[1]; @@ -236,14 +236,14 @@ struct LookupTableV2GradCUDAFunctor { private: const framework::ExecutionContext &context_; - const framework::Tensor *ids_t_; + const phi::DenseTensor *ids_t_; }; template class LookupTableV2GradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const auto *ids_t = context.Input("Ids"); + const auto *ids_t = context.Input("Ids"); LookupTableV2GradCUDAFunctor functor(context, ids_t); framework::VisitIntDataType(framework::TransToProtoVarType(ids_t->dtype()), functor); diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 1e12b00ebb944..49ef1c282f016 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; @@ -151,7 +151,7 @@ template class LookupTableV2Kernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const auto *ids = context.Input("Ids"); + const auto *ids = context.Input("Ids"); LookupTableV2CPUFunctor functor(context, ids); framework::VisitIntDataType(framework::TransToProtoVarType(ids->dtype()), functor); @@ -272,7 +272,7 @@ template class LookupTableV2GradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const auto *ids = context.Input("Ids"); + const auto *ids = context.Input("Ids"); LookupTableV2GradCPUFunctor functor(context, ids); framework::VisitIntDataType(framework::TransToProtoVarType(ids->dtype()), functor); diff --git a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc index 2cda715f14efa..282b8581ca482 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class LookupTableV2MLUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index 842bbd2c672ee..41d53a3e531ca 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; constexpr int64_t kNoPadding = -1; template diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index ca2fba56697fc..b2ef8f0370e37 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -27,15 +27,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; using DataLayout = framework::DataLayout; template struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor& input, - framework::Tensor* out, - framework::Tensor* mid, + const phi::DenseTensor& input, + phi::DenseTensor* out, + phi::DenseTensor* mid, int N, int C, int H, @@ -49,7 +48,7 @@ struct LRNFunctor { auto blas = phi::funcs::GetBlas(ctx); phi::funcs::Transpose transpose; auto& dev_ctx = ctx.template device_context(); - Tensor in_transpose, mid_transpose, out_transpose; + phi::DenseTensor in_transpose, mid_transpose, out_transpose; // if channel_last, transpose to channel_first if (data_layout == DataLayout::kNHWC) { auto in_dims = input.dims(); @@ -72,7 +71,7 @@ struct LRNFunctor { T* odata = out_transpose.data(); T* mdata = mid_transpose.data(); - Tensor squared; + phi::DenseTensor squared; T* sdata = squared.mutable_data({1, C + n - 1, H, W}, place); std::memset(sdata, 0, sizeof(T) * squared.numel()); for (int i = 0; i < mid->numel(); ++i) { @@ -122,11 +121,11 @@ template struct LRNFunctor; template struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor& x, - const framework::Tensor& out, - const framework::Tensor& mid, - framework::Tensor* x_g, - const framework::Tensor& out_g, + const phi::DenseTensor& x, + const phi::DenseTensor& out, + const phi::DenseTensor& mid, + phi::DenseTensor* x_g, + const phi::DenseTensor& out_g, int N, int C, int H, @@ -241,7 +240,7 @@ class LRNOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && @@ -375,7 +374,7 @@ class LRNOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && diff --git a/paddle/fluid/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu index 8c95cf1d0c9da..4bd0074328189 100644 --- a/paddle/fluid/operators/lrn_op.cu +++ b/paddle/fluid/operators/lrn_op.cu @@ -110,9 +110,9 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, template struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor& input, - framework::Tensor* out, - framework::Tensor* mid, + const phi::DenseTensor& input, + phi::DenseTensor* out, + phi::DenseTensor* mid, int N, int C, int H, @@ -238,11 +238,11 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, template struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor& x, - const framework::Tensor& out, - const framework::Tensor& mid, - 
framework::Tensor* x_g, - const framework::Tensor& out_g, + const phi::DenseTensor& x, + const phi::DenseTensor& out, + const phi::DenseTensor& mid, + phi::DenseTensor* x_g, + const phi::DenseTensor& out_g, int N, int C, int H, diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index 890542f0ed1b2..1b8d2c04f69a3 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -29,9 +29,9 @@ using DataLayout = framework::DataLayout; template struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor& input, - framework::Tensor* out, - framework::Tensor* mid, + const phi::DenseTensor& input, + phi::DenseTensor* out, + phi::DenseTensor* mid, int N, int C, int H, @@ -46,14 +46,14 @@ struct LRNFunctor { template class LRNKernel : public framework::OpKernel { public: - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; // f(x) = x * ( k + alpha * SUM((x)^2) )^(-beta) // x represents inputs // f(x) represents outputs void Compute(const framework::ExecutionContext& ctx) const override { // input - const Tensor& x = *ctx.Input("X"); + const phi::DenseTensor& x = *ctx.Input("X"); auto x_dims = x.dims(); const std::string data_layout_str = ctx.Attr("data_format"); @@ -65,11 +65,11 @@ class LRNKernel : public framework::OpKernel { int H = (data_layout != DataLayout::kNHWC ? x_dims[2] : x_dims[1]); int W = (data_layout != DataLayout::kNHWC ? x_dims[3] : x_dims[2]); - Tensor* out = ctx.Output("Out"); + phi::DenseTensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); // MidOut save the intermediate result for backward - Tensor* mid = ctx.Output("MidOut"); + phi::DenseTensor* mid = ctx.Output("MidOut"); mid->mutable_data(ctx.GetPlace()); int n = ctx.Attr("n"); @@ -104,11 +104,11 @@ class LRNKernel : public framework::OpKernel { template struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor& x, - const framework::Tensor& out, - const framework::Tensor& mid, - framework::Tensor* x_g, - const framework::Tensor& out_g, + const phi::DenseTensor& x, + const phi::DenseTensor& out, + const phi::DenseTensor& mid, + phi::DenseTensor* x_g, + const phi::DenseTensor& out_g, int N, int C, int H, @@ -141,17 +141,18 @@ struct LRNGradFunctor { template class LRNGradKernel : public framework::OpKernel { public: - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor& x = *ctx.Input("X"); - const Tensor& out = *ctx.Input("Out"); - const Tensor& out_g = *ctx.Input(framework::GradVarName("Out")); - const Tensor& mid = *ctx.Input("MidOut"); + const phi::DenseTensor& x = *ctx.Input("X"); + const phi::DenseTensor& out = *ctx.Input("Out"); + const phi::DenseTensor& out_g = + *ctx.Input(framework::GradVarName("Out")); + const phi::DenseTensor& mid = *ctx.Input("MidOut"); const std::string data_layout_str = ctx.Attr("data_format"); const framework::DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - auto x_g = ctx.Output(framework::GradVarName("X")); + auto x_g = ctx.Output(framework::GradVarName("X")); x_g->mutable_data(ctx.GetPlace()); auto x_dims = x.dims(); diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 01e381dc7a3cd..b864919259f59 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -25,13 +25,13 @@ namespace paddle { namespace operators { using LoDTensor = 
framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template inline void ReorderInitState(const DeviceContext& ctx, - const framework::Tensor& src, + const phi::DenseTensor& src, framework::Vector index_lod, - framework::Tensor* dst, + phi::DenseTensor* dst, bool indexed_src) { phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); @@ -45,11 +45,11 @@ class LSTMKernel : public framework::OpKernel { bool is_test = ctx.Attr("is_test"); auto* input = ctx.Input("Input"); - auto* weight = ctx.Input("Weight"); - auto* bias = ctx.Input("Bias"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); - auto* hidden_t0 = ctx.Input("H0"); - auto* cell_t0 = ctx.Input("C0"); + auto* hidden_t0 = ctx.Input("H0"); + auto* cell_t0 = ctx.Input("C0"); LoDTensor* batch_gate = nullptr; LoDTensor batch_gate_temp; @@ -205,8 +205,8 @@ class LSTMGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("Input"); - auto* weight = ctx.Input("Weight"); - auto* bias = ctx.Input("Bias"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); auto* hidden_out = ctx.Input("Hidden"); auto* cell_out = ctx.Input("Cell"); @@ -217,14 +217,15 @@ class LSTMGradKernel : public framework::OpKernel { auto* hidden_g = ctx.Input(framework::GradVarName("Hidden")); auto* in_g = ctx.Output(framework::GradVarName("Input")); - auto* weight_g = ctx.Output(framework::GradVarName("Weight")); - auto* bias_g = ctx.Output(framework::GradVarName("Bias")); + auto* weight_g = + ctx.Output(framework::GradVarName("Weight")); + auto* bias_g = ctx.Output(framework::GradVarName("Bias")); - auto* h0 = ctx.Input("H0"); - auto* c0 = ctx.Input("C0"); + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); - auto* h0_g = ctx.Output(framework::GradVarName("H0")); - auto* c0_g = ctx.Output(framework::GradVarName("C0")); + auto* h0_g = ctx.Output(framework::GradVarName("H0")); + auto* c0_g = ctx.Output(framework::GradVarName("C0")); auto& device_ctx = ctx.template device_context(); phi::funcs::SetConstant zero; diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu index 29486400a6f2d..ffc6e42587f1c 100644 --- a/paddle/fluid/operators/lstm_unit_op.cu +++ b/paddle/fluid/operators/lstm_unit_op.cu @@ -107,10 +107,10 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel { true, paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto* x_tensor = ctx.Input("X"); - auto* c_prev_tensor = ctx.Input("C_prev"); - auto* c_tensor = ctx.Output("C"); - auto* h_tensor = ctx.Output("H"); + auto* x_tensor = ctx.Input("X"); + auto* c_prev_tensor = ctx.Input("C_prev"); + auto* c_tensor = ctx.Output("C"); + auto* h_tensor = ctx.Output("H"); auto forget_bias = static_cast(ctx.Attr("forget_bias")); @@ -140,17 +140,20 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { true, paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto x_tensor = ctx.Input("X"); - auto c_prev_tensor = ctx.Input("C_prev"); - auto c_tensor = ctx.Input("C"); - auto h_tensor = ctx.Input("H"); + auto x_tensor = ctx.Input("X"); + auto c_prev_tensor = ctx.Input("C_prev"); + auto c_tensor = ctx.Input("C"); + auto h_tensor = ctx.Input("H"); - auto hdiff_tensor = ctx.Input(framework::GradVarName("H")); - auto cdiff_tensor = ctx.Input(framework::GradVarName("C")); + auto hdiff_tensor = + 
ctx.Input(framework::GradVarName("H")); + auto cdiff_tensor = + ctx.Input(framework::GradVarName("C")); - auto xdiff_tensor = ctx.Output(framework::GradVarName("X")); + auto xdiff_tensor = + ctx.Output(framework::GradVarName("X")); auto c_prev_diff_tensor = - ctx.Output(framework::GradVarName("C_prev")); + ctx.Output(framework::GradVarName("C_prev")); auto* X = x_tensor->data(); auto* C_prev = c_prev_tensor->data(); diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h index a135ee1369de8..abb2eb1620dbe 100644 --- a/paddle/fluid/operators/lstm_unit_op.h +++ b/paddle/fluid/operators/lstm_unit_op.h @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - template inline T sigmoid(T x) { return 1. / (1. + exp(-x)); @@ -44,10 +42,10 @@ class LstmUnitKernel : public framework::OpKernel { true, paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); - auto* x_tensor = ctx.Input("X"); - auto* c_prev_tensor = ctx.Input("C_prev"); - auto* c_tensor = ctx.Output("C"); - auto* h_tensor = ctx.Output("H"); + auto* x_tensor = ctx.Input("X"); + auto* c_prev_tensor = ctx.Input("C_prev"); + auto* c_tensor = ctx.Output("C"); + auto* h_tensor = ctx.Output("H"); auto forget_bias = static_cast(ctx.Attr("forget_bias")); @@ -89,16 +87,19 @@ class LstmUnitGradKernel : public framework::OpKernel { true, paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); - auto x_tensor = ctx.Input("X"); - auto c_prev_tensor = ctx.Input("C_prev"); - auto c_tensor = ctx.Input("C"); + auto x_tensor = ctx.Input("X"); + auto c_prev_tensor = ctx.Input("C_prev"); + auto c_tensor = ctx.Input("C"); - auto hdiff_tensor = ctx.Input(framework::GradVarName("H")); - auto cdiff_tensor = ctx.Input(framework::GradVarName("C")); + auto hdiff_tensor = + ctx.Input(framework::GradVarName("H")); + auto cdiff_tensor = + ctx.Input(framework::GradVarName("C")); - auto xdiff_tensor = ctx.Output(framework::GradVarName("X")); + auto xdiff_tensor = + ctx.Output(framework::GradVarName("X")); auto c_prev_diff_tensor = - ctx.Output(framework::GradVarName("C_prev")); + ctx.Output(framework::GradVarName("C_prev")); auto* X = x_tensor->data(); auto* C_prev = c_prev_tensor->data(); diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index a00d0f7f36545..298f54944bbe6 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -30,7 +30,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using platform::Transform; template inline void ReorderInitState(const DeviceContext& ctx, - const framework::Tensor& src, + const phi::DenseTensor& src, framework::Vector index, - framework::Tensor* dst, + phi::DenseTensor* dst, bool indexed_src) { phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); @@ -108,12 +108,12 @@ class LSTMPKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("Input"); - auto* weight = ctx.Input("Weight"); - auto* proj_weight = ctx.Input("ProjWeight"); - auto* bias = ctx.Input("Bias"); + auto* weight = ctx.Input("Weight"); + auto* proj_weight = ctx.Input("ProjWeight"); + auto* bias = ctx.Input("Bias"); - auto* hidden_t0 = ctx.Input("H0"); - auto* cell_t0 = ctx.Input("C0"); + auto* hidden_t0 = ctx.Input("H0"); + auto* cell_t0 = ctx.Input("C0"); 
auto proj_clip = static_cast(ctx.Attr("proj_clip")); auto cell_clip = static_cast(ctx.Attr("cell_clip")); @@ -306,9 +306,9 @@ class LSTMPGradKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext& ctx) const override { - auto* weight = ctx.Input("Weight"); - auto* proj_weight = ctx.Input("ProjWeight"); - auto* bias = ctx.Input("Bias"); + auto* weight = ctx.Input("Weight"); + auto* proj_weight = ctx.Input("ProjWeight"); + auto* bias = ctx.Input("Bias"); auto* proj_out = ctx.Input("Projection"); auto* cell_out = ctx.Input("Cell"); @@ -324,16 +324,17 @@ class LSTMPGradKernel : public framework::OpKernel { ctx.Input(framework::GradVarName("Projection")); auto* in_g = ctx.Output(framework::GradVarName("Input")); - auto* weight_g = ctx.Output(framework::GradVarName("Weight")); + auto* weight_g = + ctx.Output(framework::GradVarName("Weight")); auto* proj_weight_g = - ctx.Output(framework::GradVarName("ProjWeight")); - auto* bias_g = ctx.Output(framework::GradVarName("Bias")); + ctx.Output(framework::GradVarName("ProjWeight")); + auto* bias_g = ctx.Output(framework::GradVarName("Bias")); - auto* h0 = ctx.Input("H0"); - auto* c0 = ctx.Input("C0"); + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); - auto* h0_g = ctx.Output(framework::GradVarName("H0")); - auto* c0_g = ctx.Output(framework::GradVarName("C0")); + auto* h0_g = ctx.Output(framework::GradVarName("H0")); + auto* c0_g = ctx.Output(framework::GradVarName("C0")); auto& device_ctx = ctx.template device_context(); phi::funcs::SetConstant zero; diff --git a/paddle/fluid/operators/margin_rank_loss_op.h b/paddle/fluid/operators/margin_rank_loss_op.h index c1bf44510766b..4968f093f5629 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.h +++ b/paddle/fluid/operators/margin_rank_loss_op.h @@ -38,12 +38,12 @@ template class MarginRankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* act_t = ctx.Output("Activated"); + auto* out_t = ctx.Output("Out"); + auto* act_t = ctx.Output("Activated"); - auto* label_t = ctx.Input("Label"); - auto* x1_t = ctx.Input("X1"); - auto* x2_t = ctx.Input("X2"); + auto* label_t = ctx.Input("Label"); + auto* x1_t = ctx.Input("X1"); + auto* x2_t = ctx.Input("X2"); out_t->mutable_data(ctx.GetPlace()); act_t->mutable_data(ctx.GetPlace()); @@ -71,9 +71,9 @@ class MarginRankLossGradKernel : public framework::OpKernel { auto* d_x2_t = ctx.Output(framework::GradVarName("X2")); - auto* act_t = ctx.Input("Activated"); - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* label_t = ctx.Input("Label"); + auto* act_t = ctx.Input("Activated"); + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* label_t = ctx.Input("Label"); auto d_out = framework::EigenVector::Flatten(*d_out_t); auto act = framework::EigenVector::Flatten(*act_t); diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu index 3b52788514b91..7c6fe79ab7ff1 100644 --- a/paddle/fluid/operators/marker_op.cu +++ b/paddle/fluid/operators/marker_op.cu @@ -40,8 +40,8 @@ class MarkerOpCUDAKernel : public framework::OpKernel { VLOG(3) << "marker role: " << marker_role << " marker position: " << marker_pos; - framework::Tensor A; - framework::Tensor B; + phi::DenseTensor A; + phi::DenseTensor B; auto* in_temp = A.mutable_data({32, 1}, ctx.GetPlace()); auto* out_temp = B.mutable_data({32, 1}, ctx.GetPlace()); platform::RecordEvent record_event( diff --git 
a/paddle/fluid/operators/masked_select_op_mlu.cc b/paddle/fluid/operators/masked_select_op_mlu.cc index 279096b762ca8..50c9973721836 100644 --- a/paddle/fluid/operators/masked_select_op_mlu.cc +++ b/paddle/fluid/operators/masked_select_op_mlu.cc @@ -22,9 +22,9 @@ template class MaskedSelectedMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto mask = ctx.Input("Mask"); - auto out = ctx.Output("Y"); + auto input = ctx.Input("X"); + auto mask = ctx.Input("Mask"); + auto out = ctx.Output("Y"); auto input_dim = input->dims(); auto mask_dim = mask->dims(); @@ -66,9 +66,9 @@ template class MaskedSelectedGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto mask = ctx.Input("Mask"); - auto y_grad = ctx.Input(framework::GradVarName("Y")); - auto x_grad = ctx.Output(framework::GradVarName("X")); + auto mask = ctx.Input("Mask"); + auto y_grad = ctx.Input(framework::GradVarName("Y")); + auto x_grad = ctx.Output(framework::GradVarName("X")); auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/masked_select_op_npu.cc b/paddle/fluid/operators/masked_select_op_npu.cc index 653da86d81c23..df8a32273297b 100644 --- a/paddle/fluid/operators/masked_select_op_npu.cc +++ b/paddle/fluid/operators/masked_select_op_npu.cc @@ -22,9 +22,9 @@ template class MaskedSelectedNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto mask = ctx.Input("Mask"); - auto out = ctx.Output("Y"); + auto input = ctx.Input("X"); + auto mask = ctx.Input("Mask"); + auto out = ctx.Output("Y"); auto input_dim = input->dims(); auto mask_dim = mask->dims(); @@ -111,9 +111,9 @@ template class MaskedSelectedGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto mask = ctx.Input("Mask"); - auto y_grad = ctx.Input(framework::GradVarName("Y")); - auto x_grad = ctx.Output(framework::GradVarName("X")); + auto mask = ctx.Input("Mask"); + auto y_grad = ctx.Input(framework::GradVarName("Y")); + auto x_grad = ctx.Output(framework::GradVarName("X")); x_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index 820e754049a23..12538b72128fe 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -24,7 +24,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using LoD = framework::LoD; @@ -244,7 +244,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* w = ctx.Input("W"); + auto* w = ctx.Input("W"); auto* out = ctx.Output("Out"); auto* tmp = ctx.Output("Tmp"); @@ -324,7 +324,7 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* w = ctx.Input("W"); + auto* w = ctx.Input("W"); auto* tmp = ctx.Input("Tmp"); int dim_t = ctx.Attr("dim_t"); @@ -391,7 +391,7 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel { auto blas = phi::funcs::GetBlas(ctx); auto* t_data = w->data(); - auto* d_w = ctx.Output(framework::GradVarName("W")); + auto* d_w = ctx.Output(framework::GradVarName("W")); auto* t_diff = d_w->mutable_data(ctx.GetPlace()); memset(t_diff, 0.0, w->dims()[0] * w->dims()[1] * w->dims()[2] * sizeof(T)); // bottom_diff diff --git a/paddle/fluid/operators/match_matrix_tensor_op.h b/paddle/fluid/operators/match_matrix_tensor_op.h index b067d1c028bd3..72e99222ddffb 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.h +++ b/paddle/fluid/operators/match_matrix_tensor_op.h @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class MatchMatrixTensorOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index 2b607ade728c4..fcb92a7ac7f38 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -32,7 +32,7 @@ class BeamSearchFunctor { const framework::LoDTensor *scores, framework::LoDTensor *selected_ids, framework::LoDTensor *selected_scores, - framework::Tensor *parent_idx, + phi::DenseTensor *parent_idx, size_t level, size_t beam_size, int end_id, diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 80af6f673c40f..02be32bf146e0 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -412,7 +412,7 @@ class BeamSearchFunctor { const framework::LoDTensor* scores, framework::LoDTensor* selected_ids, framework::LoDTensor* selected_scores, - framework::Tensor* parent_idx, + phi::DenseTensor* parent_idx, size_t level, size_t beam_size, int end_id, diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h index d444b0abb4798..c6c05434c0b4a 100644 --- a/paddle/fluid/operators/math/beam_search.h +++ b/paddle/fluid/operators/math/beam_search.h @@ -112,7 +112,7 @@ class BeamSearchFunctor { const framework::LoDTensor* scores, framework::LoDTensor* selected_ids, framework::LoDTensor* selected_scores, - framework::Tensor* parent_idx, + phi::DenseTensor* parent_idx, size_t level, size_t beam_size, int end_id, diff --git a/paddle/fluid/operators/math/beam_search_npu.cc b/paddle/fluid/operators/math/beam_search_npu.cc index b49d4e848b067..1daf97194ed52 100644 --- a/paddle/fluid/operators/math/beam_search_npu.cc +++ 
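The match_matrix_tensor and beam_search hunks above show the second pattern in this patch: files with a file-local alias only retarget it to phi::DenseTensor, while fully qualified framework::Tensor uses are rewritten at each call site. A sketch of how both forms read after the change; the kernel body is illustrative, not taken from any file here:

#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

using Tensor = phi::DenseTensor;  // was: using Tensor = framework::Tensor;

// Hypothetical kernel, for illustration only.
template <typename DeviceContext, typename T>
class AliasedKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<Tensor>("X");            // alias users compile unchanged
    auto* w = ctx.Input<phi::DenseTensor>("W");  // explicit uses are rewritten
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
    (void)x;
    (void)w;
  }
};

}  // namespace operators
}  // namespace paddle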
b/paddle/fluid/operators/math/beam_search_npu.cc @@ -41,7 +41,7 @@ class BeamSearchFunctor { const framework::LoDTensor* scores, framework::LoDTensor* selected_ids, framework::LoDTensor* selected_scores, - framework::Tensor* parent_idx, + phi::DenseTensor* parent_idx, size_t level, size_t beam_size, int end_id, @@ -185,8 +185,8 @@ class BeamSearchFunctor { "FillD", {true_tmp_tensor}, {second_pos_true_tensors}, fill_attr2); runner_fill_true_tensors.Run(stream); - std::vector concat_inputs = {first_pos_false_tensors, - second_pos_true_tensors}; + std::vector concat_inputs = {first_pos_false_tensors, + second_pos_true_tensors}; std::vector concat_names = {"x0", "x1"}; NpuOpRunner runner_concat_false_true{"ConcatD", {concat_inputs}, @@ -403,8 +403,8 @@ class BeamSearchFunctor { sorted_score_indices.Resize( phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - std::vector concat_inputs2 = {batch_ids, - sorted_score_indices}; + std::vector concat_inputs2 = {batch_ids, + sorted_score_indices}; std::vector concat_names = {"x0", "x1"}; NpuOpRunner runner_concat_score_indices{"ConcatD", {concat_inputs2}, @@ -429,8 +429,8 @@ class BeamSearchFunctor { phi::make_ddim({num_seqs, static_cast(beam_size), 2})); gather_nd_id_indices.mutable_data(place); - std::vector concat_inputs3 = {batch_ids, - cast_sort_tmp_indices}; + std::vector concat_inputs3 = {batch_ids, + cast_sort_tmp_indices}; NpuOpRunner runner_concat_id_indices{"ConcatD", {concat_inputs3}, {gather_nd_id_indices}, diff --git a/paddle/fluid/operators/math/beam_search_xpu.cc b/paddle/fluid/operators/math/beam_search_xpu.cc index 9904c142e5a08..ad8edd5d23f81 100644 --- a/paddle/fluid/operators/math/beam_search_xpu.cc +++ b/paddle/fluid/operators/math/beam_search_xpu.cc @@ -62,7 +62,7 @@ class BeamSearchFunctor { const framework::LoDTensor *scores, framework::LoDTensor *selected_ids, framework::LoDTensor *selected_scores, - framework::Tensor *parent_idx, + phi::DenseTensor *parent_idx, size_t level, size_t beam_size, int end_id, diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 603584629cc92..a74b345ec835f 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -41,9 +41,9 @@ template class ConcatFunctor { public: void operator()(const phi::CPUContext& context, - const std::vector& input, + const std::vector& input, int axis, - framework::Tensor* output) { + phi::DenseTensor* output) { phi::funcs::ConcatFunctor functor; functor(context, input, axis, output); } @@ -57,10 +57,10 @@ template class SplitFunctor { public: void operator()(const phi::CPUContext& context, - const framework::Tensor& input, - const std::vector& ref_inputs, + const phi::DenseTensor& input, + const std::vector& ref_inputs, const int axis, - std::vector* outputs) { + std::vector* outputs) { phi::funcs::SplitFunctor functor; functor(context, input, ref_inputs, axis, outputs); } @@ -75,9 +75,9 @@ template class ConcatFunctor { public: void operator()(const platform::XPUDeviceContext& context, - const std::vector& input, + const std::vector& input, int axis, - framework::Tensor* output) { + phi::DenseTensor* output) { int dev_id = context.GetPlace().GetDeviceId(); platform::XPUDeviceGuard guard(dev_id); @@ -115,10 +115,10 @@ template class SplitFunctor { public: void operator()(const platform::XPUDeviceContext& context, - const framework::Tensor& input, - const std::vector& ref_inputs, + const phi::DenseTensor& input, + const std::vector& ref_inputs, 
const int axis, - std::vector* outputs) { + std::vector* outputs) { int dev_id = context.GetPlace().GetDeviceId(); platform::XPUDeviceGuard guard(dev_id); @@ -168,9 +168,9 @@ template class ConcatFunctor { public: void operator()(const platform::NPUDeviceContext& context, - const std::vector& input, + const std::vector& input, int axis, - framework::Tensor* output) { + phi::DenseTensor* output) { int dev_id = context.GetPlace().GetDeviceId(); platform::NPUDeviceGuard guard(dev_id); @@ -192,10 +192,10 @@ template class SplitFunctor { public: void operator()(const platform::NPUDeviceContext& context, - const framework::Tensor& input, - const std::vector& ref_inputs, + const phi::DenseTensor& input, + const std::vector& ref_inputs, const int axis, - std::vector* outputs) { + std::vector* outputs) { if (input.numel() == 0) { return; } @@ -246,9 +246,9 @@ template class ConcatFunctor { public: void operator()(const platform::MLUDeviceContext& context, - const std::vector& input, + const std::vector& input, int axis, - framework::Tensor* output) { + phi::DenseTensor* output) { int dev_id = context.GetPlace().GetDeviceId(); platform::MLUDeviceGuard guard(dev_id); @@ -287,10 +287,10 @@ template class SplitFunctor { public: void operator()(const platform::MLUDeviceContext& context, - const framework::Tensor& input, - const std::vector& ref_inputs, + const phi::DenseTensor& input, + const std::vector& ref_inputs, const int axis, - std::vector* outputs) { + std::vector* outputs) { if (input.numel() == 0) { return; } diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 11508fd2d1eae..69b183aa9a01c 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -26,9 +26,9 @@ template class ConcatFunctor { public: void operator()(const phi::GPUContext& context, - const std::vector& input, + const std::vector& input, int axis, - framework::Tensor* output) { + phi::DenseTensor* output) { phi::funcs::ConcatFunctor functor; functor(context, input, axis, output); } @@ -42,10 +42,10 @@ template class SplitFunctor { public: void operator()(const phi::GPUContext& context, - const framework::Tensor& input, - const std::vector& ref_inputs, + const phi::DenseTensor& input, + const std::vector& ref_inputs, int axis, - std::vector* outputs) { + std::vector* outputs) { phi::funcs::SplitFunctor functor; functor(context, input, ref_inputs, axis, outputs); } diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index 66727b8cdbd94..83513caa14573 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -38,9 +38,9 @@ template class ConcatFunctor { public: void operator()(const DeviceContext& context, - const std::vector& input, + const std::vector& input, int axis, - framework::Tensor* output); + phi::DenseTensor* output); }; /* @@ -59,10 +59,10 @@ template class SplitFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& input, - const std::vector& ref_inputs, + const phi::DenseTensor& input, + const std::vector& ref_inputs, int axis, - std::vector* outputs); + std::vector* outputs); }; } // namespace math diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index ccbe1c2aeed00..b350167cfb46b 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -29,13 
+29,13 @@ limitations under the License. */ */ template void ConcatCase1(DeviceContext* context) { - paddle::framework::Tensor input_a_cpu; - paddle::framework::Tensor input_b_cpu; - paddle::framework::Tensor out_cpu; + phi::DenseTensor input_a_cpu; + phi::DenseTensor input_b_cpu; + phi::DenseTensor out_cpu; - paddle::framework::Tensor input_a; - paddle::framework::Tensor input_b; - paddle::framework::Tensor out; + phi::DenseTensor input_a; + phi::DenseTensor input_b; + phi::DenseTensor out; auto dim_a = phi::make_ddim({2, 3, 4}); auto dim_b = phi::make_ddim({3, 3, 4}); @@ -73,7 +73,7 @@ void ConcatCase1(DeviceContext* context) { paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } - std::vector input; + std::vector input; input.push_back(input_a); input.push_back(input_b); @@ -134,13 +134,13 @@ void ConcatCase1(DeviceContext* context) { */ template void ConcatCase2(DeviceContext* context) { - paddle::framework::Tensor input_a_cpu; - paddle::framework::Tensor input_b_cpu; - paddle::framework::Tensor out_cpu; + phi::DenseTensor input_a_cpu; + phi::DenseTensor input_b_cpu; + phi::DenseTensor out_cpu; - paddle::framework::Tensor input_a; - paddle::framework::Tensor input_b; - paddle::framework::Tensor out; + phi::DenseTensor input_a; + phi::DenseTensor input_b; + phi::DenseTensor out; auto dim_a = phi::make_ddim({2, 3, 4}); auto dim_b = phi::make_ddim({2, 4, 4}); @@ -178,7 +178,7 @@ void ConcatCase2(DeviceContext* context) { paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } - std::vector input; + std::vector input; input.push_back(input_a); input.push_back(input_b); @@ -243,13 +243,13 @@ void ConcatCase2(DeviceContext* context) { */ template void ConcatCase3(DeviceContext* context) { - paddle::framework::Tensor input_a_cpu; - paddle::framework::Tensor input_b_cpu; - paddle::framework::Tensor out_cpu; + phi::DenseTensor input_a_cpu; + phi::DenseTensor input_b_cpu; + phi::DenseTensor out_cpu; - paddle::framework::Tensor input_a; - paddle::framework::Tensor input_b; - paddle::framework::Tensor out; + phi::DenseTensor input_a; + phi::DenseTensor input_b; + phi::DenseTensor out; auto dim_a = phi::make_ddim({2, 3, 4}); auto dim_b = phi::make_ddim({2, 3, 5}); @@ -287,7 +287,7 @@ void ConcatCase3(DeviceContext* context) { paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } - std::vector input; + std::vector input; input.push_back(input_a); input.push_back(input_b); @@ -354,13 +354,13 @@ void ConcatCase3(DeviceContext* context) { */ template void ConcatCase4(DeviceContext* context) { - paddle::framework::Tensor input_a_cpu; - paddle::framework::Tensor input_b_cpu; - paddle::framework::Tensor out_cpu; + phi::DenseTensor input_a_cpu; + phi::DenseTensor input_b_cpu; + phi::DenseTensor out_cpu; - paddle::framework::Tensor input_a; - paddle::framework::Tensor input_b; - paddle::framework::Tensor out; + phi::DenseTensor input_a; + phi::DenseTensor input_b; + phi::DenseTensor out; auto dim_a = phi::make_ddim({2, 3, 4}); auto dim_b = phi::make_ddim({2, 3, 4}); @@ -398,7 +398,7 @@ void ConcatCase4(DeviceContext* context) { paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } - std::vector input; + std::vector input; input.push_back(input_a); input.push_back(input_b); diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index 026b6a9d8fef5..7811bc2854ffd 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -26,7 +26,7 @@ namespace 
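The ConcatFunctor/SplitFunctor wrappers in operators/math keep their names and template parameters but now traffic entirely in phi::DenseTensor, with the CPU and GPU specializations forwarding to phi::funcs. A usage sketch in the spirit of the ConcatCase tests above, with made-up shapes:

#include "paddle/fluid/operators/math/concat_and_split.h"

// Sketch only; assumes `context` is an initialized CPU context, as in the
// ConcatCase* tests above. Fill `a` and `b` before concatenating for real.
void ConcatSketch(phi::CPUContext* context) {
  paddle::platform::CPUPlace place;

  phi::DenseTensor a, b, out;
  a.mutable_data<float>(phi::make_ddim({2, 3, 4}), place);
  b.mutable_data<float>(phi::make_ddim({3, 3, 4}), place);
  out.mutable_data<float>(phi::make_ddim({5, 3, 4}), place);

  std::vector<phi::DenseTensor> inputs;
  inputs.push_back(a);
  inputs.push_back(b);

  paddle::operators::math::ConcatFunctor<phi::CPUContext, float> concat;
  concat(*context, inputs, /*axis=*/0, &out);  // forwards to phi::funcs::ConcatFunctor
}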
operators { namespace math { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; /* @@ -90,14 +90,14 @@ class ContextProjectFunctor { public: void operator()(const DeviceContext& context, const LoDTensor& in, - const Tensor* padding_data, + const phi::DenseTensor* padding_data, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, - Tensor* col) { + phi::DenseTensor* col) { auto lod_level_0 = in.lod()[0]; math::Im2ColFunctor im2col_ocf; @@ -226,8 +226,8 @@ class ContextProjectGradFunctor { const int down_pad, bool pad_grad, bool input_grad, - Tensor* padding_data, - Tensor* col) { + phi::DenseTensor* padding_data, + phi::DenseTensor* col) { auto lod_level_0 = in.lod()[0]; math::Col2ImFunctor col2im_ocf; diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index 17ff6aff6f93d..f87f5a107e696 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -21,7 +21,7 @@ namespace paddle { namespace operators { namespace math { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template @@ -29,9 +29,9 @@ using EigenMatrix = framework::EigenMatrix; template struct HardLabelCrossEntropyCPUFunctorImpl { - HardLabelCrossEntropyCPUFunctorImpl(framework::Tensor* out, - const framework::Tensor* prob, - const framework::Tensor* labels, + HardLabelCrossEntropyCPUFunctorImpl(phi::DenseTensor* out, + const phi::DenseTensor* prob, + const phi::DenseTensor* labels, const int ignore_index, const int axis_dim) : out_(out), @@ -85,9 +85,9 @@ struct HardLabelCrossEntropyCPUFunctorImpl { } private: - framework::Tensor* out_; - const framework::Tensor* prob_; - const framework::Tensor* labels_; + phi::DenseTensor* out_; + const phi::DenseTensor* prob_; + const phi::DenseTensor* labels_; const int ignore_index_; const int axis_dim_; }; @@ -95,9 +95,9 @@ struct HardLabelCrossEntropyCPUFunctorImpl { template void CrossEntropyFunctor::operator()( const DeviceContext& ctx, - framework::Tensor* out, - const framework::Tensor* prob, - const framework::Tensor* labels, + phi::DenseTensor* out, + const phi::DenseTensor* prob, + const phi::DenseTensor* labels, const bool softLabel, const int ignore_index, const int axis_dim) { diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index c366dd6fcef34..0e5b95542455e 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -111,9 +111,9 @@ struct HardLabelCrossEntropyCUDAFunctorImpl { template void CrossEntropyFunctor::operator()( const DeviceContext& ctx, - framework::Tensor* out, - const framework::Tensor* prob, - const framework::Tensor* labels, + phi::DenseTensor* out, + const phi::DenseTensor* prob, + const phi::DenseTensor* labels, const bool softLabel, const int ignore_index, const int axis_dim) { diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index 0de10789ba02e..fba4c2ebc61c2 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -61,9 +61,9 @@ template class CrossEntropyFunctor { public: void operator()(const DeviceContext& context, - framework::Tensor* out, - const framework::Tensor* prob, - const framework::Tensor* labels, + phi::DenseTensor* out, + const phi::DenseTensor* prob, + const 
phi::DenseTensor* labels, const bool softLabel, const int ignore_index, const int axis_dim); diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 9192badedcfff..39b0312e67766 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -35,11 +35,11 @@ class Im2ColFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& im, + const phi::DenseTensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* col, + phi::DenseTensor* col, const DataLayout data_layout) { PADDLE_ENFORCE_EQ(im.dims().size(), 3, @@ -82,11 +82,11 @@ class Col2ImFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& col, + const phi::DenseTensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* im, + phi::DenseTensor* im, const DataLayout data_layout) { PADDLE_ENFORCE_EQ(im->dims().size(), 3, @@ -184,11 +184,11 @@ class Im2ColFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& im, + const phi::DenseTensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* col, + phi::DenseTensor* col, const DataLayout data_layout) { PADDLE_ENFORCE_EQ(im.dims().size(), 3, @@ -259,11 +259,11 @@ class Col2ImFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& col, + const phi::DenseTensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* im, + phi::DenseTensor* im, const DataLayout data_layout) { PADDLE_ENFORCE_EQ(im->dims().size(), 3, diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 5812b5d9b26b1..843e50c50a697 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -91,11 +91,11 @@ class Im2ColFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& im, + const phi::DenseTensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* col, + phi::DenseTensor* col, const DataLayout data_layout) { PADDLE_ENFORCE_EQ(im.dims().size(), 3, @@ -228,11 +228,11 @@ class Col2ImFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& col, + const phi::DenseTensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* im, + phi::DenseTensor* im, const DataLayout data_layout) { PADDLE_ENFORCE_EQ(im->dims().size(), 3, @@ -372,11 +372,11 @@ class Im2ColFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& im, + const phi::DenseTensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* col, + phi::DenseTensor* col, const DataLayout data_layout) { PADDLE_ENFORCE_EQ(im.dims().size(), 3, @@ -485,11 +485,11 @@ class Col2ImFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& col, + const phi::DenseTensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* im, + phi::DenseTensor* im, const DataLayout data_layout) { PADDLE_ENFORCE_EQ(im->dims().size(), 3, diff --git a/paddle/fluid/operators/math/im2col.h b/paddle/fluid/operators/math/im2col.h index 
5e02f166d65be..3cc87ca5d23da 100644 --- a/paddle/fluid/operators/math/im2col.h +++ b/paddle/fluid/operators/math/im2col.h @@ -87,11 +87,11 @@ template class Im2ColFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& im, + const phi::DenseTensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* col, + phi::DenseTensor* col, const DataLayout data_layout = DataLayout::kNCHW); }; @@ -99,11 +99,11 @@ template class Col2ImFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& col, + const phi::DenseTensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* im, + phi::DenseTensor* im, const DataLayout data_layout = DataLayout::kNCHW); }; diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h index ab560d6d7005b..bef9e0a8449f6 100644 --- a/paddle/fluid/operators/math/im2col_cfo_cpu.h +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -27,11 +27,11 @@ namespace math { * Support dilation, stride and padding. */ template -inline void im2col_common(const framework::Tensor& im, +inline void im2col_common(const phi::DenseTensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, - framework::Tensor* col, + phi::DenseTensor* col, const DataLayout data_layout = DataLayout::kNCHW) { int im_channels = (data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]); @@ -77,8 +77,8 @@ inline void im2col_common(const framework::Tensor& im, */ template inline void im2col_sh1sw1dh1dw1ph0pw0( - const framework::Tensor& im, - framework::Tensor* col, + const phi::DenseTensor& im, + phi::DenseTensor* col, const DataLayout data_layout = DataLayout::kNCHW) { int im_channels = (data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]); @@ -129,8 +129,8 @@ inline void im2col_sh1sw1dh1dw1ph0pw0( * and filter_width == 1 have a special implementation */ template -inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im, - framework::Tensor* col, +inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, + phi::DenseTensor* col, const DataLayout data_layout) { int im_channels = (data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]); diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 09ec777ebb633..70ac7a225d6a3 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -22,11 +22,11 @@ limitations under the License. 
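The im2col/col2im functors keep their ColFormat/device/type template parameters; only the image and column arguments become phi::DenseTensor. An illustrative CFO call in the spirit of the test below; the 1x3x3 image, 2x2 filter geometry and the four-entry padding vector are assumptions, not taken from the patch:

#include "paddle/fluid/operators/math/im2col.h"

// Sketch only. Padding is written as {up, left, down, right}, all zero here.
void Im2ColSketch() {
  paddle::platform::CPUPlace place;
  phi::CPUContext context(place);

  phi::DenseTensor im;   // input image,    [channels, height, width]
  phi::DenseTensor col;  // output patches, [C, fh, fw, out_h, out_w]
  im.mutable_data<float>(phi::make_ddim({1, 3, 3}), place);
  col.mutable_data<float>(phi::make_ddim({1, 2, 2, 2, 2}), place);

  std::vector<int> dilation({1, 1});
  std::vector<int> stride({1, 1});
  std::vector<int> padding({0, 0, 0, 0});

  paddle::operators::math::Im2ColFunctor<
      paddle::operators::math::ColFormat::kCFO, phi::CPUContext, float>
      im2col;
  im2col(context, im, dilation, stride, padding, &col);
}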
*/ template void testIm2col() { - paddle::framework::Tensor input_tmp; - paddle::framework::Tensor input; - paddle::framework::Tensor output_cfo; - paddle::framework::Tensor output_ocf; - paddle::framework::Tensor output_tmp; + phi::DenseTensor input_tmp; + phi::DenseTensor input; + phi::DenseTensor output_cfo; + phi::DenseTensor output_ocf; + phi::DenseTensor output_tmp; /** * input = [0, 1, 2, @@ -180,11 +180,11 @@ void testIm2col() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> void testIm2col() { - paddle::framework::Tensor input_tmp; - paddle::framework::Tensor input; - paddle::framework::Tensor output_cfo; - paddle::framework::Tensor output_ocf; - paddle::framework::Tensor output_tmp; + phi::DenseTensor input_tmp; + phi::DenseTensor input; + phi::DenseTensor output_cfo; + phi::DenseTensor output_ocf; + phi::DenseTensor output_tmp; /** * input = [0, 1, 2, @@ -349,9 +349,9 @@ TEST(math, im2col) { #define PREPARE_IM2COL_CPU \ paddle::platform::CPUPlace place; \ phi::CPUContext context(place); \ - paddle::framework::Tensor input; \ - paddle::framework::Tensor out; \ - paddle::framework::Tensor ref; \ + phi::DenseTensor input; \ + phi::DenseTensor out; \ + phi::DenseTensor ref; \ std::vector padding({ph, pw}); \ std::vector stride({1, 1}); \ std::vector dilation({1, 1}); \ diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 0648f2497d9d7..aa2779c350ab6 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -20,10 +20,10 @@ namespace math { template struct MatrixBitCodeFunctorAdd { - const framework::Tensor &vec_; - framework::Tensor *tmat_; + const phi::DenseTensor &vec_; + phi::DenseTensor *tmat_; - MatrixBitCodeFunctorAdd(const framework::Tensor &vec, framework::Tensor *tmat) + MatrixBitCodeFunctorAdd(const phi::DenseTensor &vec, phi::DenseTensor *tmat) : vec_(vec), tmat_(tmat) {} template @@ -44,18 +44,18 @@ struct MatrixBitCodeFunctorAdd { }; template -void MatrixBitCodeFunctor::Add(const framework::Tensor &vec, - framework::Tensor *tmat) { +void MatrixBitCodeFunctor::Add(const phi::DenseTensor &vec, + phi::DenseTensor *tmat) { MatrixBitCodeFunctorAdd func(vec, tmat); paddle::visit(func, code_table_); } template struct MatrixBitCodeFunctorAddGrad { - const framework::Tensor &tmat_; - framework::Tensor *vec_; - MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat, - framework::Tensor *vec) + const phi::DenseTensor &tmat_; + phi::DenseTensor *vec_; + MatrixBitCodeFunctorAddGrad(const phi::DenseTensor &tmat, + phi::DenseTensor *vec) : tmat_(tmat), vec_(vec) {} template @@ -76,20 +76,20 @@ struct MatrixBitCodeFunctorAddGrad { }; template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, - framework::Tensor *vec) { +void MatrixBitCodeFunctor::AddGrad(const phi::DenseTensor &tmat, + phi::DenseTensor *vec) { MatrixBitCodeFunctorAddGrad func(tmat, vec); paddle::visit(func, code_table_); } template struct MatrixBitCodeFunctorSum { - const framework::Tensor &tmat_; - framework::Tensor *sum_; + const phi::DenseTensor &tmat_; + phi::DenseTensor *sum_; T scale_sum_; - MatrixBitCodeFunctorSum(const framework::Tensor &tmat, - framework::Tensor *sum, + MatrixBitCodeFunctorSum(const phi::DenseTensor &tmat, + phi::DenseTensor *sum, T scale_sum) : tmat_(tmat), sum_(sum), scale_sum_(scale_sum) {} @@ -117,8 +117,8 @@ struct MatrixBitCodeFunctorSum { }; template -void MatrixBitCodeFunctor::Sum(const framework::Tensor &tmat, - 
framework::Tensor *sum, +void MatrixBitCodeFunctor::Sum(const phi::DenseTensor &tmat, + phi::DenseTensor *sum, T scale_sum) { MatrixBitCodeFunctorSum func(tmat, sum, scale_sum); paddle::visit(func, code_table_); @@ -126,13 +126,13 @@ void MatrixBitCodeFunctor::Sum(const framework::Tensor &tmat, template struct MatrixBitCodeFunctorMul { - framework::Tensor *tmat_; - const framework::Tensor &weight_; - const framework::Tensor &input_; + phi::DenseTensor *tmat_; + const phi::DenseTensor &weight_; + const phi::DenseTensor &input_; - MatrixBitCodeFunctorMul(framework::Tensor *tmat, - const framework::Tensor &weight, - const framework::Tensor &input) + MatrixBitCodeFunctorMul(phi::DenseTensor *tmat, + const phi::DenseTensor &weight, + const phi::DenseTensor &input) : tmat_(tmat), weight_(weight), input_(input) {} template @@ -160,9 +160,9 @@ struct MatrixBitCodeFunctorMul { }; template -void MatrixBitCodeFunctor::Mul(framework::Tensor *tmat, - const framework::Tensor &weight, - const framework::Tensor &input) { +void MatrixBitCodeFunctor::Mul(phi::DenseTensor *tmat, + const phi::DenseTensor &weight, + const phi::DenseTensor &input) { MatrixBitCodeFunctorMul func(tmat, weight, input); paddle::visit(func, code_table_); } @@ -175,12 +175,12 @@ class ReservedVector : public std::vector { template struct MatrixBitCodeFunctorMulGradWeight { - const framework::Tensor &tmat_; - framework::Tensor *weight_; - const framework::Tensor &input_; - MatrixBitCodeFunctorMulGradWeight(const framework::Tensor &tmat, - framework::Tensor *weight, - const framework::Tensor &input) + const phi::DenseTensor &tmat_; + phi::DenseTensor *weight_; + const phi::DenseTensor &input_; + MatrixBitCodeFunctorMulGradWeight(const phi::DenseTensor &tmat, + phi::DenseTensor *weight, + const phi::DenseTensor &input) : tmat_(tmat), weight_(weight), input_(input) {} template void operator()(const CodeTable &code_table) { @@ -216,22 +216,22 @@ struct MatrixBitCodeFunctorMulGradWeight { }; template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, - framework::Tensor *weight, - const framework::Tensor &input) { +void MatrixBitCodeFunctor::MulGradWeight(const phi::DenseTensor &tmat, + phi::DenseTensor *weight, + const phi::DenseTensor &input) { MatrixBitCodeFunctorMulGradWeight func(tmat, weight, input); paddle::visit(func, code_table_); } template struct MatrixBitCodeFunctorMulGradWeightSR { - const framework::Tensor &tmat_; + const phi::DenseTensor &tmat_; phi::SelectedRows *weight_; - const framework::Tensor &input_; + const phi::DenseTensor &input_; - MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat, + MatrixBitCodeFunctorMulGradWeightSR(const phi::DenseTensor &tmat, phi::SelectedRows *weight, - const framework::Tensor &input) + const phi::DenseTensor &input) : tmat_(tmat), weight_(weight), input_(input) {} template @@ -271,22 +271,22 @@ struct MatrixBitCodeFunctorMulGradWeightSR { }; template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, +void MatrixBitCodeFunctor::MulGradWeight(const phi::DenseTensor &tmat, phi::SelectedRows *weight, - const framework::Tensor &input) { + const phi::DenseTensor &input) { MatrixBitCodeFunctorMulGradWeightSR func(tmat, weight, input); paddle::visit(func, code_table_); } template struct MatrixBitCodeFunctorMulGradError { - const framework::Tensor &tmat_; - const framework::Tensor &weight_; - framework::Tensor *input_; + const phi::DenseTensor &tmat_; + const phi::DenseTensor &weight_; + phi::DenseTensor *input_; - 
MatrixBitCodeFunctorMulGradError(const framework::Tensor &tmat, - const framework::Tensor &weight, - framework::Tensor *input) + MatrixBitCodeFunctorMulGradError(const phi::DenseTensor &tmat, + const phi::DenseTensor &weight, + phi::DenseTensor *input) : tmat_(tmat), weight_(weight), input_(input) {} template void operator()(const CodeTable &code_table) { @@ -315,18 +315,18 @@ struct MatrixBitCodeFunctorMulGradError { }; template -void MatrixBitCodeFunctor::MulGradError(const framework::Tensor &tmat, - const framework::Tensor &weight, - framework::Tensor *input) { +void MatrixBitCodeFunctor::MulGradError(const phi::DenseTensor &tmat, + const phi::DenseTensor &weight, + phi::DenseTensor *input) { MatrixBitCodeFunctorMulGradError func(tmat, weight, input); paddle::visit(func, code_table_); } template struct MatrixBitCodeFunctorSub { - framework::Tensor *tmat_; + phi::DenseTensor *tmat_; - explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {} + explicit MatrixBitCodeFunctorSub(phi::DenseTensor *tmat) : tmat_(tmat) {} template void operator()(const CodeTable &code_table) { @@ -346,7 +346,7 @@ struct MatrixBitCodeFunctorSub { }; template -void MatrixBitCodeFunctor::Sub(framework::Tensor *tmat) { +void MatrixBitCodeFunctor::Sub(phi::DenseTensor *tmat) { MatrixBitCodeFunctorSub func(tmat); paddle::visit(func, code_table_); } diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 7c9d94aa8713b..eb232940b8552 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -128,8 +128,8 @@ class SimpleCode { template class CustomCode { public: - CustomCode(const framework::Tensor& path_table, - const framework::Tensor& path_code, + CustomCode(const phi::DenseTensor& path_table, + const phi::DenseTensor& path_code, const int64_t* ids, int index) { seq_len_ = path_table.dims()[1]; @@ -188,8 +188,8 @@ class SimpleCodeTable { template class CustomCodeTable { public: - CustomCodeTable(const framework::Tensor& path_table, - const framework::Tensor& path_code, + CustomCodeTable(const phi::DenseTensor& path_table, + const phi::DenseTensor& path_code, const int64_t* ids) : ptable_(path_table), pcode_(path_code), ids_(ids) {} @@ -203,8 +203,8 @@ class CustomCodeTable { } private: - const framework::Tensor& ptable_; - const framework::Tensor& pcode_; + const phi::DenseTensor& ptable_; + const phi::DenseTensor& pcode_; const int64_t* ids_; }; @@ -218,8 +218,8 @@ class MatrixBitCodeFunctor { ids_(ids), code_table_(SimpleCodeTable(num_classes, ids)) {} - MatrixBitCodeFunctor(const framework::Tensor& path_table, - const framework::Tensor& path_code, + MatrixBitCodeFunctor(const phi::DenseTensor& path_table, + const phi::DenseTensor& path_code, const int64_t* ids) : num_classes_(static_cast(path_table.dims()[1])), ids_(ids), @@ -227,47 +227,47 @@ class MatrixBitCodeFunctor { /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ - void Add(const framework::Tensor& vec, framework::Tensor* tmat); + void Add(const phi::DenseTensor& vec, phi::DenseTensor* tmat); /* For j < code_length vec(0, index(i, j)) += tmat(i, j) */ - void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); + void AddGrad(const phi::DenseTensor& tmat, phi::DenseTensor* vec); /* For j < code_length sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) */ - void Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum); + void Sum(const phi::DenseTensor& tmat, phi::DenseTensor* sum, T scale_sum); /* For 
j < code_length tmat(i, j) -= bit(i, j) */ - void Sub(framework::Tensor* tmat); + void Sub(phi::DenseTensor* tmat); /* For j < code_length input.row(i) += tmat(i, j) * weight.row(index(i, j)) */ - void Mul(framework::Tensor* tmat, - const framework::Tensor& weight, - const framework::Tensor& input); + void Mul(phi::DenseTensor* tmat, + const phi::DenseTensor& weight, + const phi::DenseTensor& input); /* For index(i, j) >= 0: weight.row(index(i, j)) += tmat(i, j) * input.row(i) */ - void MulGradWeight(const framework::Tensor& tmat, - framework::Tensor* weight, - const framework::Tensor& input); + void MulGradWeight(const phi::DenseTensor& tmat, + phi::DenseTensor* weight, + const phi::DenseTensor& input); /* For SelectedRows Weight, For index(i, j) >= 0: weight.row(index(i, j)) += tmat(i, j) * input.row(i) */ - void MulGradWeight(const framework::Tensor& tmat, + void MulGradWeight(const phi::DenseTensor& tmat, phi::SelectedRows* weight, - const framework::Tensor& input); + const phi::DenseTensor& input); /* For j < code_length input.row(i) += tmat(i, j) * weight.row(index(i, j)) */ - void MulGradError(const framework::Tensor& tmat, - const framework::Tensor& weight, - framework::Tensor* input); + void MulGradError(const phi::DenseTensor& tmat, + const phi::DenseTensor& weight, + phi::DenseTensor* input); size_t num_classes_; const int64_t* ids_; diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 2205ed51e1913..91ae7d472d931 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -23,8 +23,8 @@ namespace math { // All tensors are in NCHW or NHWC format, and the groups must be greater than 1 template void MaxOutFunctor::operator()(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* output, + const phi::DenseTensor& input, + phi::DenseTensor* output, const int groups, const int axis) { const int batch_size = input.dims()[0]; @@ -66,10 +66,10 @@ void MaxOutFunctor::operator()(const DeviceContext& context, template void MaxOutGradFunctor::operator()( const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, + const phi::DenseTensor& input, + phi::DenseTensor* input_grad, + const phi::DenseTensor& output, + const phi::DenseTensor& output_grad, const int groups, const int axis) { const int batch_size = input.dims()[0]; diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index c84d90897220e..df115fd16966d 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -107,8 +107,8 @@ __global__ void KernelMaxoutGrad(const int nthreads, */ template void MaxOutFunctor::operator()(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* output, + const phi::DenseTensor& input, + phi::DenseTensor* output, const int groups, const int axis) { const int batch_size = input.dims()[0]; @@ -140,10 +140,10 @@ void MaxOutFunctor::operator()(const DeviceContext& context, template void MaxOutGradFunctor::operator()( const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, + const phi::DenseTensor& input, + phi::DenseTensor* input_grad, + const phi::DenseTensor& output, + const phi::DenseTensor& output_grad, const int groups, const int axis) { 
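MatrixBitCodeFunctor's public surface, listed above, now works on phi::DenseTensor throughout. A construction sketch under assumed inputs; the parameter names are placeholders, not the argument list of any operator in this patch:

#include "paddle/fluid/operators/math/matrix_bit_code.h"

// Hypothetical wiring, for illustration only.
void BitCodeSketch(const phi::DenseTensor& path_table,
                   const phi::DenseTensor& path_code,
                   const phi::DenseTensor& label,
                   const phi::DenseTensor& bias,
                   phi::DenseTensor* pre_out) {
  const int64_t* ids = label.data<int64_t>();
  paddle::operators::math::MatrixBitCodeFunctor<float> bit_code(
      path_table, path_code, ids);
  bit_code.Add(bias, pre_out);  // pre_out(i, j) += bias(0, index(i, j))
  bit_code.Sub(pre_out);        // pre_out(i, j) -= bit(i, j)
}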
const int batch_size = input.dims()[0]; diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index d1a6f92185cba..f42bbdb0e38ee 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -26,8 +26,8 @@ template class MaxOutFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* output, + const phi::DenseTensor& input, + phi::DenseTensor* output, const int groups, const int axis = 1); }; @@ -36,10 +36,10 @@ template class MaxOutGradFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, + const phi::DenseTensor& input, + phi::DenseTensor* input_grad, + const phi::DenseTensor& output, + const phi::DenseTensor& output_grad, const int groups, const int axis = 1); }; diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index f18053e297e55..e3cc5a5741b02 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -31,7 +31,7 @@ namespace paddle { namespace operators { namespace math { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template __device__ T gpu_adjust_prob(const T prob, @@ -129,9 +129,9 @@ void GPUSampleWithProb::operator()(const phi::GPUContext& context, const int dict_size, const bool uniq, const std::size_t num_samples, - const Tensor* L, - Tensor* S, - Tensor* P) { + const phi::DenseTensor* L, + phi::DenseTensor* S, + phi::DenseTensor* P) { // UNDERSTAND: dimension issues const auto lbl_dim = L->dims(); const int batch_size = lbl_dim[0]; diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index ad4d3489c21fe..2464ac25186f0 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -27,7 +27,7 @@ namespace paddle { namespace operators { namespace math { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; /* UNDERSTAND: utility function to adjust probability for unique sampling, return whatever as it is if not using unique samping */ @@ -46,9 +46,9 @@ class SampleWithProb { void operator()(const DeviceContext& context, const Sampler& sampler, const std::size_t num_samples, - const Tensor* L, - Tensor* S, - Tensor* P) { + const phi::DenseTensor* L, + phi::DenseTensor* S, + phi::DenseTensor* P) { // UNDERSTAND: dimension issues const auto& lbl_dim = L->dims(); const int batch_size = lbl_dim[0]; @@ -117,9 +117,9 @@ class GPUSampleWithProb { const int dict_size, const bool uniq, const std::size_t num_samples, - const Tensor* L, - Tensor* S, - Tensor* P); + const phi::DenseTensor* L, + phi::DenseTensor* S, + phi::DenseTensor* P); }; #endif } // namespace math diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 354af32beabee..c1b57899cf7ef 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -113,8 +113,8 @@ template struct SelectedRowsAddTensor { void operator()(const phi::CPUContext& context, const phi::SelectedRows& input1, - const framework::Tensor& input2, - framework::Tensor* output) { + const phi::DenseTensor& input2, + phi::DenseTensor* output) { auto in1_height = input1.height(); const auto& in2_dims = 
input2.dims(); const auto& out_dims = output->dims(); @@ -280,7 +280,7 @@ template struct SelectedRowsAddToTensor { void operator()(const phi::CPUContext& context, const phi::SelectedRows& input1, - framework::Tensor* input2) { + phi::DenseTensor* input2) { if (UNLIKELY(input1.rows().size() == 0)) { LOG(WARNING) << "input selected rows is empty!"; return; @@ -851,7 +851,7 @@ struct UpdateToTensor { void operator()(const phi::CPUContext& context, const ScatterOps& op, const phi::SelectedRows& input1, - framework::Tensor* input2) { + phi::DenseTensor* input2) { auto in1_height = input1.height(); const auto& in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 7fa9dc27db9cd..27ee703ac5a79 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -137,8 +137,8 @@ template struct SelectedRowsAddTensor { void operator()(const phi::GPUContext& context, const phi::SelectedRows& input1, - const framework::Tensor& input2, - framework::Tensor* output) { + const phi::DenseTensor& input2, + phi::DenseTensor* output) { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); @@ -289,7 +289,7 @@ template struct SelectedRowsAddToTensor { void operator()(const phi::GPUContext& context, const phi::SelectedRows& input1, - framework::Tensor* input2) { + phi::DenseTensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( @@ -591,7 +591,7 @@ struct UpdateToTensor { void operator()(const phi::GPUContext& context, const ScatterOps& op, const phi::SelectedRows& input1, - framework::Tensor* input2) { + phi::DenseTensor* input2) { // NOTE: Use SelectedRowsAddToTensor for better performance // no additional MergeAdd called. 
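SelectedRowsAddTensor and SelectedRowsAddToTensor keep their SelectedRows argument and take the dense side as phi::DenseTensor. A CPU sketch in the style of the cpu_add test below; the shapes are illustrative and `rows` is assumed to already hold matching height/row_numel data:

#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/kernels/funcs/math_function.h"

// Sketch only; assumes `ctx` is an initialized phi::CPUContext.
void AddRowsSketch(const phi::CPUContext& ctx, const phi::SelectedRows& rows) {
  paddle::platform::CPUPlace cpu_place;
  const int64_t height = 10;
  const int64_t row_numel = 10;

  phi::DenseTensor dense, out;
  dense.mutable_data<float>(phi::make_ddim({height, row_numel}), cpu_place);
  out.mutable_data<float>(phi::make_ddim({height, row_numel}), cpu_place);

  phi::funcs::SetConstant<phi::CPUContext, float> set_const;
  set_const(ctx, &dense, 3.0f);

  // out = rows (scattered by row index) + dense
  paddle::operators::math::SelectedRowsAddTensor<phi::CPUContext, float>
      add_tensor;
  add_tensor(ctx, rows, dense, &out);
}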
MergeAdd merge_func; diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index cf64b5d77e5be..76df85f0a6807 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -44,8 +44,8 @@ template struct SelectedRowsAddTensor { void operator()(const DeviceContext& context, const phi::SelectedRows& input1, - const framework::Tensor& input2, - framework::Tensor* output); + const phi::DenseTensor& input2, + phi::DenseTensor* output); }; // input2 = input1 + input2 @@ -73,7 +73,7 @@ template struct SelectedRowsAddToTensor { void operator()(const DeviceContext& context, const phi::SelectedRows& input1, - framework::Tensor* input2); + phi::DenseTensor* input2); }; namespace scatter { @@ -115,7 +115,7 @@ struct UpdateToTensor { void operator()(const DeviceContext& context, const ScatterOps& op, const phi::SelectedRows& input1, - framework::Tensor* input2); + phi::DenseTensor* input2); }; } // namespace scatter diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index ecb8aa7824724..700050420826d 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -79,13 +79,11 @@ TEST(selected_rows_functor, cpu_add) { EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); - std::unique_ptr tensor1{ - new paddle::framework::Tensor()}; + std::unique_ptr tensor1{new phi::DenseTensor()}; tensor1->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); - std::unique_ptr tensor2{ - new paddle::framework::Tensor()}; + std::unique_ptr tensor2{new phi::DenseTensor()}; tensor2->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); paddle::operators::math::SelectedRowsAddTensor @@ -174,8 +172,7 @@ TEST(selected_rows_functor, cpu_add_to) { EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); - std::unique_ptr tensor1{ - new paddle::framework::Tensor()}; + std::unique_ptr tensor1{new phi::DenseTensor()}; tensor1->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); @@ -475,8 +472,7 @@ TEST(selected_rows_functor, cpu_sum_to) { EXPECT_EQ(out_data[4 * row_numel + 4], 2.0); EXPECT_EQ(out_data[5 * row_numel + 7], 2.0); EXPECT_EQ(out_data[6 * row_numel + 9], 2.0); - std::unique_ptr tensor1{ - new paddle::framework::Tensor()}; + std::unique_ptr tensor1{new phi::DenseTensor()}; tensor1->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); paddle::operators::math::SelectedRowsAddToTensor diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 746a64ff58cde..7c04b466b006d 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -79,7 +79,7 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_rows[5], 7); EXPECT_EQ(out_rows[6], 9); - paddle::framework::Tensor out_cpu; + phi::DenseTensor out_cpu; paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); @@ -96,20 +96,18 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); - std::unique_ptr tensor1{ - new 
paddle::framework::Tensor()}; + std::unique_ptr tensor1{new phi::DenseTensor()}; tensor1->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - std::unique_ptr tensor2{ - new paddle::framework::Tensor()}; + std::unique_ptr tensor2{new phi::DenseTensor()}; tensor2->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); paddle::operators::math::SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); - paddle::framework::Tensor tensor2_cpu; + phi::DenseTensor tensor2_cpu; paddle::framework::TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu); ctx.Wait(); @@ -184,7 +182,7 @@ TEST(selected_rows_functor, gpu_add_to) { EXPECT_EQ(out_rows[5], 7); EXPECT_EQ(out_rows[6], 9); - paddle::framework::Tensor out_cpu; + phi::DenseTensor out_cpu; paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); @@ -201,8 +199,7 @@ TEST(selected_rows_functor, gpu_add_to) { EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0); EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0); - std::unique_ptr tensor1{ - new paddle::framework::Tensor()}; + std::unique_ptr tensor1{new phi::DenseTensor()}; tensor1->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); @@ -210,7 +207,7 @@ TEST(selected_rows_functor, gpu_add_to) { add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); - paddle::framework::Tensor tensor1_cpu; + phi::DenseTensor tensor1_cpu; paddle::framework::TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu); ctx.Wait(); @@ -269,7 +266,7 @@ TEST(selected_rows_functor, gpu_merge_add) { inputs.push_back(selected_rows2.get()); merge_add_functor(ctx, inputs, output.get()); - paddle::framework::Tensor output_cpu; + phi::DenseTensor output_cpu; paddle::framework::TensorCopy(output->value(), cpu_place, ctx, &output_cpu); ctx.Wait(); diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 273f99a5f9691..9575f4e6e2466 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -26,8 +26,8 @@ namespace operators { namespace math { template -void CopyValidData(framework::Tensor* dst_tensor, - const framework::Tensor* src_tensor, +void CopyValidData(phi::DenseTensor* dst_tensor, + const phi::DenseTensor* src_tensor, const framework::Vector& seq_offsets, int pad_seq_len, int step_width, diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index a600c37a89108..bcd683d6ec137 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -24,7 +24,7 @@ namespace paddle { namespace operators { namespace math { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template dims(); auto idx_dims = index->dims(); @@ -121,7 +121,7 @@ class MaxSeqPoolFunctor { const framework::LoDTensor& input, T pad_value, framework::LoDTensor* output, - framework::Tensor* index) { + phi::DenseTensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); PADDLE_ENFORCE_GT(in_dims.size(), @@ -180,7 +180,7 @@ class MaxSeqPoolGradFunctor { public: void operator()(const phi::CPUContext& context, const framework::LoDTensor& out_grad, - const framework::Tensor& index, + const phi::DenseTensor& index, framework::LoDTensor* in_grad) { auto og_dims = out_grad.dims(); auto ig_dims = in_grad->dims(); 
@@ -352,7 +352,7 @@ class SequencePoolFunctor { const framework::LoDTensor& input, framework::LoDTensor* output, bool is_test, - framework::Tensor* index = nullptr) { + phi::DenseTensor* index = nullptr) { if (pooltype == "MAX") { if (is_test) { math::MaxSeqPoolFunctor max_pool; @@ -442,7 +442,7 @@ class SequencePoolGradFunctor { const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, /* max pool has index */ - const framework::Tensor* index = nullptr) { + const phi::DenseTensor* index = nullptr) { if (pooltype == "MAX") { math::MaxSeqPoolGradFunctor max_pool_grad; max_pool_grad(context, out_grad, *index, in_grad); diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index a5edb1db95c3f..41b322ba23179 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -197,7 +197,7 @@ class SequencePoolFunctor { const framework::LoDTensor& input, framework::LoDTensor* output, bool is_test, - framework::Tensor* index = nullptr) { + phi::DenseTensor* index = nullptr) { auto lod_level = input.lod().size(); auto& lod = input.lod()[lod_level - 1]; const size_t item_dim = output->numel() / output->dims()[0]; @@ -415,7 +415,7 @@ class SequencePoolGradFunctor { const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, /* max pool has index */ - const framework::Tensor* index = nullptr) { + const phi::DenseTensor* index = nullptr) { auto lod_level = in_grad->lod().size(); auto& lod = in_grad->lod()[lod_level - 1]; const size_t item_dim = in_grad->numel() / in_grad->dims()[0]; diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h index 378fb3a172add..a82d7ad4f802d 100644 --- a/paddle/fluid/operators/math/sequence_pooling.h +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -33,7 +33,7 @@ class SequencePoolFunctor { const framework::LoDTensor& input, framework::LoDTensor* output, bool is_test = false, - framework::Tensor* index = nullptr); + phi::DenseTensor* index = nullptr); }; template @@ -44,7 +44,7 @@ class SequencePoolGradFunctor { const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, /* max pool has index */ - const framework::Tensor* index = nullptr); + const phi::DenseTensor* index = nullptr); }; } // namespace math diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 9cff64f75607b..422e52351c235 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -92,7 +92,7 @@ void TestSequencePoolingSum(const DeviceContext &context, for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { int64_t begin = in_grad.lod()[0][i]; int64_t end = in_grad.lod()[0][i + 1]; - paddle::framework::Tensor tmp = in_grad.Slice(begin, end); + phi::DenseTensor tmp = in_grad.Slice(begin, end); for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { for (int64_t m = 0; m != second_dim; ++m) { EXPECT_EQ(tmp.data()[m + j * second_dim], @@ -104,7 +104,7 @@ void TestSequencePoolingSum(const DeviceContext &context, for (size_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) { int64_t begin = cpu_in_grad.lod()[0][i]; int64_t end = cpu_in_grad.lod()[0][i + 1]; - paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end); + phi::DenseTensor tmp = cpu_in_grad.Slice(begin, end); for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) { for (int64_t m = 0; m != second_dim; ++m) 
{ EXPECT_EQ(tmp.data()[m + j * second_dim], diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 6729b962f2af2..c70e1e3e7405a 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -23,7 +23,7 @@ namespace paddle { namespace operators { namespace math { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using DataLayout = platform::DataLayout; template @@ -32,8 +32,8 @@ using CudnnDataType = platform::CudnnDataType; template void SoftmaxCUDNNFunctor::operator()( const DeviceContext& context, - const framework::Tensor* X, - framework::Tensor* Y) { + const phi::DenseTensor* X, + phi::DenseTensor* Y) { // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor xDesc; ScopedTensorDescriptor yDesc; @@ -83,9 +83,9 @@ void SoftmaxCUDNNFunctor::operator()( template void SoftmaxGradCUDNNFunctor::operator()( const DeviceContext& context, - const framework::Tensor* Y, - const framework::Tensor* YGrad, - framework::Tensor* XGrad) { + const phi::DenseTensor* Y, + const phi::DenseTensor* YGrad, + phi::DenseTensor* XGrad) { // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor yDesc; ScopedTensorDescriptor dyDesc; diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h index 958244bdbb208..9d25309d146a8 100644 --- a/paddle/fluid/operators/math/softmax.h +++ b/paddle/fluid/operators/math/softmax.h @@ -24,8 +24,8 @@ class SoftmaxFunctor { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* X, - framework::Tensor* Y); + const phi::DenseTensor* X, + phi::DenseTensor* Y); }; template @@ -33,9 +33,9 @@ class SoftmaxGradFunctor { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* y, - const framework::Tensor* y_grad, - framework::Tensor* x_grad); + const phi::DenseTensor* y, + const phi::DenseTensor* y_grad, + phi::DenseTensor* x_grad); }; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -43,17 +43,17 @@ template class SoftmaxCUDNNFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor* X, - framework::Tensor* Y); + const phi::DenseTensor* X, + phi::DenseTensor* Y); }; template class SoftmaxGradCUDNNFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor* Y, - const framework::Tensor* y_grad, - framework::Tensor* x_grad); + const phi::DenseTensor* Y, + const phi::DenseTensor* y_grad, + phi::DenseTensor* x_grad); }; #endif diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 8a0eb2ad7a91c..3ce7374e4d39f 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -47,8 +47,8 @@ class SoftmaxEigen { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* X, - framework::Tensor* Y) { + const phi::DenseTensor* X, + phi::DenseTensor* Y) { constexpr int kBatchDim = 0; constexpr int kClassDim = 1; constexpr int kAxisDim = 1; @@ -108,8 +108,8 @@ class SoftmaxEigen { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* X, - framework::Tensor* Y) { + const phi::DenseTensor* X, + phi::DenseTensor* Y) { constexpr int kBatchDim = 0; constexpr int kClassDim = 1; constexpr int kAxisDim = 1; @@ 
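The softmax functors keep their device/type (and is_test) template parameters; X, Y and the gradient tensors simply become phi::DenseTensor. An illustrative CPU call; the batch/class sizes are made up and `logits` should be filled before use:

#include "paddle/fluid/operators/math/softmax.h"

// Sketch only; assumes `ctx` is an initialized phi::CPUContext.
void SoftmaxSketch(const phi::CPUContext& ctx) {
  paddle::platform::CPUPlace place;
  const int batch = 4;
  const int classes = 8;

  phi::DenseTensor logits, out;
  logits.mutable_data<float>(phi::make_ddim({batch, classes}), place);
  out.mutable_data<float>(phi::make_ddim({batch, classes}), place);

  paddle::operators::math::SoftmaxFunctor<phi::CPUContext, float,
                                          /*is_test=*/false>
      softmax;
  softmax(ctx, /*axis_dim=*/classes, &logits, &out);
}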
-166,8 +166,8 @@ class SoftmaxEigen { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* X, - framework::Tensor* Y) { + const phi::DenseTensor* X, + phi::DenseTensor* Y) { constexpr int kBatchDim = 0; constexpr int kClassDim = 1; constexpr int kAxisDim = 1; @@ -223,8 +223,8 @@ template void SoftmaxFunctor::operator()( const DeviceContext& context, const int axis_dim, - const framework::Tensor* X, - framework::Tensor* Y) { + const phi::DenseTensor* X, + phi::DenseTensor* Y) { SoftmaxEigen()(context, axis_dim, X, Y); } @@ -237,8 +237,8 @@ class SoftmaxFunctor> { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* X, - framework::Tensor* Y) { + const phi::DenseTensor* X, + phi::DenseTensor* Y) { const auto& in_dims = X->dims(); constexpr int kBatchDim = 0; constexpr int kClassDim = 1; @@ -277,9 +277,9 @@ class SoftmaxGradEigen { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* y, - const framework::Tensor* y_grad, - framework::Tensor* x_grad) { + const phi::DenseTensor* y, + const phi::DenseTensor* y_grad, + phi::DenseTensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); auto logits_grad = EigenMatrix::From(*x_grad); @@ -312,9 +312,9 @@ class SoftmaxGradEigen { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* y, - const framework::Tensor* y_grad, - framework::Tensor* x_grad) { + const phi::DenseTensor* y, + const phi::DenseTensor* y_grad, + phi::DenseTensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); auto logits_grad = EigenMatrix::From(*x_grad); @@ -346,9 +346,9 @@ class SoftmaxGradEigen { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* y, - const framework::Tensor* y_grad, - framework::Tensor* x_grad) { + const phi::DenseTensor* y, + const phi::DenseTensor* y_grad, + phi::DenseTensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); auto logits_grad = EigenMatrix::From(*x_grad); @@ -379,9 +379,9 @@ template void SoftmaxGradFunctor::operator()( const DeviceContext& context, const int axis_dim, - const framework::Tensor* y, - const framework::Tensor* y_grad, - framework::Tensor* x_grad) { + const phi::DenseTensor* y, + const phi::DenseTensor* y_grad, + phi::DenseTensor* x_grad) { SoftmaxGradEigen()(context, axis_dim, y, y_grad, x_grad); } @@ -390,9 +390,9 @@ class SoftmaxGradFunctor> { public: void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* y, - const framework::Tensor* y_grad, - framework::Tensor* x_grad) { + const phi::DenseTensor* y, + const phi::DenseTensor* y_grad, + phi::DenseTensor* x_grad) { const auto& out_dims = y->dims(); constexpr int kBatchDim = 0; constexpr int kClassDim = 1; diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index 70f377e42e59f..1bf20c9cc75a1 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -51,7 +51,7 @@ std::vector Tree2ColUtil::construct_patch( return patch; } -void Tree2ColUtil::construct_tree(const framework::Tensor &EdgeSet, +void Tree2ColUtil::construct_tree(const phi::DenseTensor &EdgeSet, std::vector> *tr, size_t *node_count) { const auto &edge_set_dims = EdgeSet.dims(); @@ -87,9 +87,9 @@ template class 
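Note: files such as softmax.cu and tree2col.cu keep a local alias instead of rewriting every occurrence, so unqualified `Tensor` still compiles but now names the phi type. A minimal sketch of that pattern (the helper function is illustrative):

#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

// Old spellings of `Tensor` inside this namespace now resolve to
// phi::DenseTensor rather than framework::Tensor.
using Tensor = phi::DenseTensor;

inline int64_t ElementCount(const Tensor& t) { return t.numel(); }

}  // namespace operators
}  // namespace paddle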
Tree2ColFunctor { public: void operator()(const phi::CPUContext &context, - const framework::Tensor &EdgeSet, - const framework::Tensor &node_features, - framework::Tensor *patch, + const phi::DenseTensor &EdgeSet, + const phi::DenseTensor &node_features, + phi::DenseTensor *patch, int max_depth) { std::vector> tr; const auto &feature_dims = node_features.dims(); @@ -141,9 +141,9 @@ template class Col2TreeFunctor { public: void operator()(const phi::CPUContext &context, - const framework::Tensor &EdgeSet, - const framework::Tensor &out_grad, - framework::Tensor *in_grad, + const phi::DenseTensor &EdgeSet, + const phi::DenseTensor &out_grad, + phi::DenseTensor *in_grad, int max_depth) { std::vector> tr; const auto &output_dims = out_grad.dims(); diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index 3aceceac32de2..3b467448ac09d 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -20,7 +20,7 @@ namespace paddle { namespace operators { namespace math { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using Node = paddle::operators::math::TreeNode; template __global__ void tree2col(const T* eta, @@ -54,9 +54,9 @@ template class Tree2ColFunctor { public: void operator()(const phi::GPUContext& context, - const framework::Tensor& EdgeSet, - const framework::Tensor& node_features, - framework::Tensor* patch, + const phi::DenseTensor& EdgeSet, + const phi::DenseTensor& node_features, + phi::DenseTensor* patch, int max_depth) { std::vector> tr; auto gpu_place = context.GetPlace(); @@ -131,9 +131,9 @@ template class Col2TreeFunctor { public: void operator()(const phi::GPUContext& context, - const framework::Tensor& EdgeSet, - const framework::Tensor& patch_grad, - framework::Tensor* embedding_grad, + const phi::DenseTensor& EdgeSet, + const phi::DenseTensor& patch_grad, + phi::DenseTensor* embedding_grad, int max_depth) { std::vector> tr; auto gpu_place = context.GetPlace(); diff --git a/paddle/fluid/operators/math/tree2col.h b/paddle/fluid/operators/math/tree2col.h index 9509a5cf3b745..154d6c2a90a43 100644 --- a/paddle/fluid/operators/math/tree2col.h +++ b/paddle/fluid/operators/math/tree2col.h @@ -65,7 +65,7 @@ class Tree2ColUtil { static std::vector construct_patch( size_t root, int max_depth, const std::vector> &tr); - static void construct_tree(const framework::Tensor &EdgeSet, + static void construct_tree(const phi::DenseTensor &EdgeSet, std::vector> *tr, size_t *node_count); }; @@ -74,18 +74,18 @@ template class Tree2ColFunctor { public: void operator()(const DeviceContext &context, - const framework::Tensor &EdgeSet, - const framework::Tensor &node_features, - framework::Tensor *patch, + const phi::DenseTensor &EdgeSet, + const phi::DenseTensor &node_features, + phi::DenseTensor *patch, int max_depth); }; template class Col2TreeFunctor { public: void operator()(const DeviceContext &context, - const framework::Tensor &EdgeSet, - const framework::Tensor &out_grad, - framework::Tensor *in_grad, + const phi::DenseTensor &EdgeSet, + const phi::DenseTensor &out_grad, + phi::DenseTensor *in_grad, int max_depth); }; } // namespace math diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc index d119e814585b5..bcfdc876b4b26 100644 --- a/paddle/fluid/operators/math/unpooling.cc +++ b/paddle/fluid/operators/math/unpooling.cc @@ -21,9 +21,9 @@ template class Unpool2dMaxFunctor { public: void operator()(const phi::CPUContext& context, - const 
framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor* output) { + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + phi::DenseTensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -64,11 +64,11 @@ template class Unpool2dMaxGradFunctor { public: void operator()(const phi::CPUContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - const framework::Tensor& output, - const framework::Tensor& output_grad, - framework::Tensor* input_grad) { + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + const phi::DenseTensor& output, + const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -110,9 +110,9 @@ template class Unpool3dMaxFunctor { public: void operator()(const phi::CPUContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor* output) { + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + phi::DenseTensor* output) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -156,11 +156,11 @@ template class Unpool3dMaxGradFunctor { public: void operator()(const phi::CPUContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - const framework::Tensor& output, - const framework::Tensor& output_grad, - framework::Tensor* input_grad) { + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + const phi::DenseTensor& output, + const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index 253f4cb027938..e3d7abb6e0d71 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -114,9 +114,9 @@ template class Unpool2dMaxFunctor { public: void operator()(const phi::GPUContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor* output) { + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + phi::DenseTensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -151,11 +151,11 @@ template class Unpool2dMaxGradFunctor { public: void operator()(const phi::GPUContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - const framework::Tensor& output, - const framework::Tensor& output_grad, - framework::Tensor* input_grad) { + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + const phi::DenseTensor& output, + const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -192,9 +192,9 @@ template class Unpool3dMaxFunctor { public: void operator()(const phi::GPUContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor* output) { + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + phi::DenseTensor* output) { const int batch_size = input.dims()[0]; 
const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -233,11 +233,11 @@ template class Unpool3dMaxGradFunctor { public: void operator()(const phi::GPUContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - const framework::Tensor& output, - const framework::Tensor& output_grad, - framework::Tensor* input_grad) { + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + const phi::DenseTensor& output, + const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; diff --git a/paddle/fluid/operators/math/unpooling.h b/paddle/fluid/operators/math/unpooling.h index 1b0f52dacd970..11d6f14a2ece3 100644 --- a/paddle/fluid/operators/math/unpooling.h +++ b/paddle/fluid/operators/math/unpooling.h @@ -23,38 +23,38 @@ template class Unpool2dMaxFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor* output); + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + phi::DenseTensor* output); }; template class Unpool2dMaxGradFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - const framework::Tensor& output, - const framework::Tensor& output_grad, - framework::Tensor* input_grad); + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + const phi::DenseTensor& output, + const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad); }; template class Unpool3dMaxFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor* output); + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + phi::DenseTensor* output); }; template class Unpool3dMaxGradFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - const framework::Tensor& output, - const framework::Tensor& output_grad, - framework::Tensor* input_grad); + const phi::DenseTensor& input, + const phi::DenseTensor& indices, + const phi::DenseTensor& output, + const phi::DenseTensor& output_grad, + phi::DenseTensor* input_grad); }; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 680cd6a344579..041d79ee1f175 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -30,11 +30,11 @@ template class Vol2ColFunctor { public: void operator()(const phi::CPUContext& context, - const framework::Tensor& vol, + const phi::DenseTensor& vol, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, - framework::Tensor* col, + phi::DenseTensor* col, const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol.dims().size(), 4, @@ -156,11 +156,11 @@ template class Col2VolFunctor { public: void operator()(const phi::CPUContext& context, - const framework::Tensor& col, + const phi::DenseTensor& col, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, - framework::Tensor* vol, + phi::DenseTensor* vol, const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol->dims().size(), 4, diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index 
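Note: the unpooling functors keep their argument order; only the tensor type changes. A minimal call-site sketch, assuming float data, the usual <DeviceContext, T> template parameters, and already-initialized input/indices/output tensors (variable names are illustrative):

#include "paddle/fluid/operators/math/unpooling.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"

void RunUnpool(const phi::CPUContext& ctx,
               const phi::DenseTensor& input,    // pooled values
               const phi::DenseTensor& indices,  // argmax positions from pooling
               phi::DenseTensor* output) {       // pre-sized unpooled result
  paddle::operators::math::Unpool2dMaxFunctor<phi::CPUContext, float> unpool;
  unpool(ctx, input, indices, output);
}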
90c2fcf6e27df..765f31eba34f0 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -104,11 +104,11 @@ __global__ void vol2col(int num_kernels, template void Vol2ColFunctor::operator()( const DeviceContext& context, - const framework::Tensor& vol, + const phi::DenseTensor& vol, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, - framework::Tensor* col, + phi::DenseTensor* col, const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol.dims().size(), 4, @@ -310,11 +310,11 @@ __global__ void col2vol(int num_kernels, template void Col2VolFunctor::operator()( const DeviceContext& context, - const framework::Tensor& col, + const phi::DenseTensor& col, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, - framework::Tensor* vol, + phi::DenseTensor* vol, const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol->dims().size(), 4, diff --git a/paddle/fluid/operators/math/vol2col.h b/paddle/fluid/operators/math/vol2col.h index 92ac7b66a0f5d..a5df8f93382ce 100644 --- a/paddle/fluid/operators/math/vol2col.h +++ b/paddle/fluid/operators/math/vol2col.h @@ -72,11 +72,11 @@ template class Vol2ColFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& vol, + const phi::DenseTensor& vol, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, - framework::Tensor* col, + phi::DenseTensor* col, const DataLayout data_layout = DataLayout::kNCHW) const; }; @@ -84,11 +84,11 @@ template class Col2VolFunctor { public: void operator()(const DeviceContext& context, - const framework::Tensor& col, + const phi::DenseTensor& col, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, - framework::Tensor* vol, + phi::DenseTensor* vol, const DataLayout data_layout = DataLayout::kNCHW) const; }; diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index c0c4ed5bb5d69..65db94752b987 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -21,10 +21,10 @@ limitations under the License. 
*/ template void testVol2col() { - paddle::framework::Tensor input; - paddle::framework::Tensor input_tmp; - paddle::framework::Tensor output; - paddle::framework::Tensor output_tmp; + phi::DenseTensor input; + phi::DenseTensor input_tmp; + phi::DenseTensor output; + phi::DenseTensor output_tmp; auto* place = new Place(); DeviceContext* context = new DeviceContext(*place); @@ -133,10 +133,10 @@ void testVol2col() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> void testVol2col() { - paddle::framework::Tensor input; - paddle::framework::Tensor input_tmp; - paddle::framework::Tensor output; - paddle::framework::Tensor output_tmp; + phi::DenseTensor input; + phi::DenseTensor input_tmp; + phi::DenseTensor output; + phi::DenseTensor output_tmp; auto* place = new paddle::platform::CUDAPlace(); auto* context = new phi::GPUContext(*place); diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index a49ceb42559c5..58fac7b69925b 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -61,10 +61,10 @@ class MatMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto &x = GET_DATA_SAFELY( - context.Input("X"), "Input", "X", "MatMul"); + context.Input("X"), "Input", "X", "MatMul"); auto &y = GET_DATA_SAFELY( - context.Input("Y"), "Input", "Y", "MatMul"); - auto *out = context.Output("Out"); + context.Input("Y"), "Input", "Y", "MatMul"); + auto *out = context.Output("Out"); auto &dev_ctx = context.template device_context(); dev_ctx.template Alloc(out, out->numel() * sizeof(T)); @@ -116,7 +116,7 @@ class MatMulKernel : public framework::OpKernel { // Reshape a rank-3 tensor from P x M x N to (P * M) x N. // Identity op if the tensor is not of rank 3. -static framework::Tensor FoldInitDims(const framework::Tensor &input) { +static phi::DenseTensor FoldInitDims(const phi::DenseTensor &input) { auto output = input; auto in_dims = input.dims(); if (in_dims.size() == 3) { @@ -129,13 +129,13 @@ static framework::Tensor FoldInitDims(const framework::Tensor &input) { // (Warning: This requires transposing data and writes into new memory.) // Identity op if the tensor is not of rank 3. template -static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context, - const framework::Tensor &input) { +static phi::DenseTensor FoldHeadAndLastDims(const DeviceContext &context, + const phi::DenseTensor &input) { auto in_dims = input.dims(); if (in_dims.size() != 3) { return input; } - framework::Tensor output; + phi::DenseTensor output; output.Resize({in_dims[1], in_dims[0], in_dims[2]}); output.mutable_data(context.GetPlace()); std::vector axis = {1, 0, 2}; @@ -153,7 +153,7 @@ static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context, * If transposed, `H,W` will be swapped. */ static void ReshapeTensorIntoMatrixSequence( - framework::Tensor *x, const phi::funcs::MatDescriptor &descriptor) { + phi::DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { int64_t h, w; h = descriptor.height_; w = descriptor.width_; @@ -181,9 +181,9 @@ static void ReshapeTensorIntoMatrixSequence( * If any of `X` and `Y` has batch size BatchSize, the out will have the * BatchSize. 
*/ -static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, - framework::Tensor *y, - framework::Tensor *out, +static void ReshapeXYOutIntoMatrixSequence(phi::DenseTensor *x, + phi::DenseTensor *y, + phi::DenseTensor *out, bool trans_x, bool trans_y) { auto x_dim = RowMatrixFromVector(x->dims()); @@ -231,11 +231,11 @@ template class MatMulGradKernel : public framework::OpKernel { public: void MatMul(const framework::ExecutionContext &context, - const framework::Tensor &a, + const phi::DenseTensor &a, bool trans_a, - const framework::Tensor &b, + const phi::DenseTensor &b, bool trans_b, - framework::Tensor *out) const { + phi::DenseTensor *out) const { out->mutable_data(context.GetPlace()); auto blas = phi::funcs::GetBlas(context); auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); @@ -266,13 +266,13 @@ class MatMulGradKernel : public framework::OpKernel { } void CalcInputGrad(const framework::ExecutionContext &context, - const framework::Tensor &a, + const phi::DenseTensor &a, bool trans_a, bool is_fold_init_dims_a, - const framework::Tensor &b, + const phi::DenseTensor &b, bool trans_b, bool is_fold_init_dims_b, - framework::Tensor *out) const { + phi::DenseTensor *out) const { if (out == nullptr) return; bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && out->dims().size() == 2; @@ -293,12 +293,11 @@ class MatMulGradKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext &context) const override { - auto x = *context.Input("X"); - auto y = *context.Input("Y"); - auto dout = - *context.Input(framework::GradVarName("Out")); - auto *dx = context.Output(framework::GradVarName("X")); - auto *dy = context.Output(framework::GradVarName("Y")); + auto x = *context.Input("X"); + auto y = *context.Input("Y"); + auto dout = *context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *dy = context.Output(framework::GradVarName("Y")); bool transpose_x = context.Attr("transpose_X"); bool transpose_y = context.Attr("transpose_Y"); @@ -370,12 +369,12 @@ template class MatMulDoubleGradKernel : public framework::OpKernel { public: void MatMul(const framework::ExecutionContext &context, - const framework::Tensor &a, + const phi::DenseTensor &a, bool trans_a, - const framework::Tensor &b, + const phi::DenseTensor &b, bool trans_b, bool flag, - framework::Tensor *out) const { + phi::DenseTensor *out) const { out->mutable_data(context.GetPlace()); auto blas = phi::funcs::GetBlas(context); auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); @@ -404,14 +403,14 @@ class MatMulDoubleGradKernel : public framework::OpKernel { } void CalcInputGrad(const framework::ExecutionContext &context, - const framework::Tensor &a, + const phi::DenseTensor &a, bool trans_a, bool is_fold_init_dims_a, - const framework::Tensor &b, + const phi::DenseTensor &b, bool trans_b, bool is_fold_init_dims_b, bool flag, - framework::Tensor *out) const { + phi::DenseTensor *out) const { if (out == nullptr) return; bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && out->dims().size() == 2; @@ -433,8 +432,8 @@ class MatMulDoubleGradKernel : public framework::OpKernel { } void Compute(const framework::ExecutionContext &context) const override { - auto x = *context.Input("X"); - auto y = *context.Input("Y"); + auto x = *context.Input("X"); + auto y = *context.Input("Y"); auto dout = *context.Input("DOut"); auto *ddx = context.Input("DDX"); auto *ddy = context.Input("DDY"); 
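Note: the matmul kernels show the accessor pattern that repeats throughout this patch: ExecutionContext::Input/Output are now instantiated with phi::DenseTensor. A minimal sketch using a hypothetical ExampleKernel (not an operator touched by the patch):

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

template <typename T>
class ExampleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Previously ctx.Input<framework::Tensor>("X"); the accessor itself is
    // unchanged, only the template argument moves to the phi type.
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->Resize(x->dims());
    out->mutable_data<T>(ctx.GetPlace());
  }
};

}  // namespace operators
}  // namespace paddle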
@@ -713,7 +712,7 @@ class MatMulOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input diff --git a/paddle/fluid/operators/matmul_op_mlu.cc b/paddle/fluid/operators/matmul_op_mlu.cc index c5484e2d0406f..e55996903a7d1 100644 --- a/paddle/fluid/operators/matmul_op_mlu.cc +++ b/paddle/fluid/operators/matmul_op_mlu.cc @@ -18,13 +18,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template static void Mul(const framework::ExecutionContext& ctx, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const float alpha) { Out->mutable_data(ctx.GetPlace()); @@ -48,9 +48,9 @@ static void Mul(const framework::ExecutionContext& ctx, template static void MatMul2D(const framework::ExecutionContext& ctx, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const bool trans_x, const bool trans_y, const float alpha) { @@ -81,9 +81,9 @@ static void MatMul2D(const framework::ExecutionContext& ctx, template static void MatMulND(const framework::ExecutionContext& ctx, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const bool trans_x, const bool trans_y, const float alpha) { @@ -118,8 +118,8 @@ template static void ReduceDims(const framework::ExecutionContext& ctx, const std::vector& dims, const std::vector& bcast_dims, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { std::vector axes; int64_t size = bcast_dims.size(); int64_t diff = bcast_dims.size() - dims.size(); @@ -162,9 +162,9 @@ template class MatMulMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); bool transpose_x = ctx.Attr("transpose_X"); bool transpose_y = ctx.Attr("transpose_Y"); float alpha = static_cast(ctx.Attr("alpha")); @@ -253,11 +253,11 @@ template class MatMulGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); bool transpose_x = ctx.Attr("transpose_X"); bool transpose_y = ctx.Attr("transpose_Y"); float alpha = static_cast(ctx.Attr("alpha")); diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc index e99b21fc696ba..31b352b90f6a8 100644 --- a/paddle/fluid/operators/matmul_op_npu.cc +++ b/paddle/fluid/operators/matmul_op_npu.cc @@ -19,15 +19,15 @@ 
limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template static void Mul(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const float alpha) { Out->mutable_data(ctx.GetPlace()); @@ -49,9 +49,9 @@ static void Mul(const framework::ExecutionContext& ctx, template static void Dot(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const float alpha) { Out->mutable_data(ctx.GetPlace()); @@ -73,9 +73,9 @@ static void Dot(const framework::ExecutionContext& ctx, template static void MatMul2D(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const bool trans_x, const bool trans_y, const float alpha) { @@ -107,9 +107,9 @@ static void MatMul2D(const framework::ExecutionContext& ctx, template static void MatMulND(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const bool trans_x, const bool trans_y, const float alpha) { @@ -143,8 +143,8 @@ static void ReduceDims(const framework::ExecutionContext& ctx, const aclrtStream& stream, const std::vector& dims, const std::vector& brd_dims, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { std::vector axes; int64_t size = brd_dims.size(); int64_t diff = brd_dims.size() - dims.size(); @@ -167,9 +167,9 @@ template class MatMulNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); bool transpose_x = ctx.Attr("transpose_X"); bool transpose_y = ctx.Attr("transpose_Y"); float alpha = static_cast(ctx.Attr("alpha")); @@ -312,11 +312,11 @@ template class MatMulGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); bool transpose_x = ctx.Attr("transpose_X"); bool transpose_y = ctx.Attr("transpose_Y"); float alpha = static_cast(ctx.Attr("alpha")); diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc index 922bf780add0b..5f9e9459800da 100644 --- a/paddle/fluid/operators/matmul_op_xpu.cc +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; template class MatMulXPUKernel : public framework::OpKernel { @@ -31,9 +30,9 @@ class MatMulXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* out = context.Output("Out"); out->mutable_data(context.GetPlace()); bool trans_x = context.Attr("transpose_X"); bool trans_y = context.Attr("transpose_Y"); @@ -86,12 +85,11 @@ class MatMulGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto x = *context.Input("X"); - auto y = *context.Input("Y"); - auto dout = - *context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dy = context.Output(framework::GradVarName("Y")); + auto x = *context.Input("X"); + auto y = *context.Input("Y"); + auto dout = *context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + auto* dy = context.Output(framework::GradVarName("Y")); bool transpose_x = context.Attr("transpose_X"); bool transpose_y = context.Attr("transpose_Y"); float alpha = static_cast(context.Attr("alpha")); diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index d00004b340d09..876a90e7b9674 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -149,7 +149,7 @@ class MatMulV2Op : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input @@ -224,7 +224,7 @@ class MatMulV2OpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 8e436dd6afbfb..70bdd0736bf4e 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -39,7 +39,7 @@ namespace operators { // Reshape a rank-3 tensor from P x M x N to (P * M) x N. // Identity op if the tensor is not of rank 3. -static framework::Tensor FoldInitDims(const framework::Tensor& input) { +static phi::DenseTensor FoldInitDims(const phi::DenseTensor& input) { auto output = input; auto in_dims = input.dims(); if (in_dims.size() == 3) { @@ -77,7 +77,7 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim& y_dim) { * If transposed, `H,W` will be swapped. 
*/ static void ReshapeTensorIntoMatrixSequence( - framework::Tensor* x, const phi::funcs::MatDescriptor& descriptor) { + phi::DenseTensor* x, const phi::funcs::MatDescriptor& descriptor) { int64_t h, w; h = descriptor.height_; w = descriptor.width_; @@ -91,9 +91,9 @@ static void ReshapeTensorIntoMatrixSequence( } } -static void ReshapeXYOutIntoMatrixSequence(framework::Tensor* x, - framework::Tensor* y, - framework::Tensor* out, +static void ReshapeXYOutIntoMatrixSequence(phi::DenseTensor* x, + phi::DenseTensor* y, + phi::DenseTensor* out, bool trans_x, bool trans_y) { auto x_dim = RowMatrixFromVector(x->dims()); diff --git a/paddle/fluid/operators/matmul_v2_op_mlu.cc b/paddle/fluid/operators/matmul_v2_op_mlu.cc index 1ea29500ddc24..134819b7920a0 100644 --- a/paddle/fluid/operators/matmul_v2_op_mlu.cc +++ b/paddle/fluid/operators/matmul_v2_op_mlu.cc @@ -18,13 +18,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template static void Mul(const framework::ExecutionContext& ctx, - const Tensor& X, - const Tensor& Y, - Tensor* Out) { + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out) { Out->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); @@ -46,9 +46,9 @@ static void Mul(const framework::ExecutionContext& ctx, template static void MatMul2D(const framework::ExecutionContext& ctx, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const bool trans_x, const bool trans_y) { Out->mutable_data(ctx.GetPlace()); @@ -70,9 +70,9 @@ static void MatMul2D(const framework::ExecutionContext& ctx, template static void MatMul2DwithReduceBatch(const framework::ExecutionContext& ctx, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const bool trans_x, const bool trans_y) { if (!Out->initialized()) { @@ -101,9 +101,9 @@ static void MatMul2DwithReduceBatch(const framework::ExecutionContext& ctx, template static void MatMulND(const framework::ExecutionContext& ctx, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const bool trans_x, const bool trans_y) { if (!Out->initialized()) { @@ -129,8 +129,8 @@ template static void ReduceDims(const framework::ExecutionContext& ctx, const std::vector& dims, const std::vector& bcast_dims, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { std::vector axes; int64_t size = bcast_dims.size(); int64_t diff = bcast_dims.size() - dims.size(); @@ -173,9 +173,9 @@ template class MatMulV2MLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); const bool trans_x = ctx.Attr("trans_x"); const bool trans_y = ctx.Attr("trans_y"); @@ -263,11 +263,11 @@ template class MatMulGradV2MLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = 
ctx.Output(framework::GradVarName("Y")); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); const bool trans_x = ctx.Attr("trans_x"); const bool trans_y = ctx.Attr("trans_y"); diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index 291894bc30ed9..4df3de71134ed 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -21,15 +21,15 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template static void MatMul2D(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const bool trans_x, const bool trans_y) { Out->mutable_data(ctx.GetPlace()); @@ -44,9 +44,9 @@ static void MatMul2D(const framework::ExecutionContext& ctx, template static void MatMulND(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const bool trans_x, const bool trans_y) { Out->mutable_data(ctx.GetPlace()); @@ -61,9 +61,9 @@ static void MatMulND(const framework::ExecutionContext& ctx, template <> void MatMulND(const framework::ExecutionContext& ctx, const aclrtStream& stream, - const Tensor& X, - const Tensor& Y, - Tensor* Out, + const phi::DenseTensor& X, + const phi::DenseTensor& Y, + phi::DenseTensor* Out, const bool trans_x, const bool trans_y) { Out->mutable_data(ctx.GetPlace()); @@ -114,8 +114,8 @@ static void ReduceDims(const framework::ExecutionContext& ctx, const aclrtStream& stream, const std::vector& dims, const std::vector& brd_dims, - const Tensor& in, - Tensor* out) { + const phi::DenseTensor& in, + phi::DenseTensor* out) { std::vector axes; int64_t size = brd_dims.size(); int64_t diff = brd_dims.size() - dims.size(); @@ -138,9 +138,9 @@ template class MatMulV2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); const bool trans_x = ctx.Attr("trans_x"); const bool trans_y = ctx.Attr("trans_y"); @@ -276,11 +276,11 @@ template class MatMulV2GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); const bool trans_x = ctx.Attr("trans_x"); const bool trans_y = ctx.Attr("trans_y"); diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index ee31607c63ad8..3e7f8a5363ac0 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ 
b/paddle/fluid/operators/mean_iou_op.cu @@ -95,11 +95,11 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto& place = *dev_ctx.eigen_device(); // get input and output tensor - auto* predictions = ctx.Input("Predictions"); - auto* labels = ctx.Input("Labels"); - auto* out_mean_iou = ctx.Output("OutMeanIou"); - auto* out_wrong = ctx.Output("OutWrong"); - auto* out_correct = ctx.Output("OutCorrect"); + auto* predictions = ctx.Input("Predictions"); + auto* labels = ctx.Input("Labels"); + auto* out_mean_iou = ctx.Output("OutMeanIou"); + auto* out_wrong = ctx.Output("OutWrong"); + auto* out_correct = ctx.Output("OutCorrect"); int num_classes = static_cast(ctx.Attr("num_classes")); // Get data ptr @@ -128,16 +128,16 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel { out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f); // collect pre wrong, correct and mean_iou - auto in_mean_ious = ctx.MultiInput("InMeanIou"); + auto in_mean_ious = ctx.MultiInput("InMeanIou"); for (int i = 0; i < in_mean_ious.size(); ++i) { out_mean_iou_t.device(place) += EigenTensor::From(*in_mean_ious[i]); } - auto in_wrongs = ctx.MultiInput("InWrongs"); + auto in_wrongs = ctx.MultiInput("InWrongs"); for (int i = 0; i < in_wrongs.size(); ++i) { out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); } - auto in_corrects = ctx.MultiInput("InCorrects"); + auto in_corrects = ctx.MultiInput("InCorrects"); for (int i = 0; i < in_corrects.size(); ++i) { out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); } diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h index 0ec92251a8e37..7681af011e663 100644 --- a/paddle/fluid/operators/mean_iou_op.h +++ b/paddle/fluid/operators/mean_iou_op.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template { auto& place = *ctx.template device_context().eigen_device(); // get input and output tensor - auto* predictions = ctx.Input("Predictions"); - auto* labels = ctx.Input("Labels"); - auto* out_mean_iou = ctx.Output("OutMeanIou"); - auto* out_wrong = ctx.Output("OutWrong"); - auto* out_correct = ctx.Output("OutCorrect"); + auto* predictions = ctx.Input("Predictions"); + auto* labels = ctx.Input("Labels"); + auto* out_mean_iou = ctx.Output("OutMeanIou"); + auto* out_wrong = ctx.Output("OutWrong"); + auto* out_correct = ctx.Output("OutCorrect"); int num_classes = static_cast(ctx.Attr("num_classes")); // get data ptr @@ -77,16 +77,16 @@ class MeanIoUKernel : public framework::OpKernel { out_mean_iou_t = out_mean_iou_t.constant(0); // collect pre wrong, correct and mean_iou - auto in_mean_ious = ctx.MultiInput("InMeanIou"); + auto in_mean_ious = ctx.MultiInput("InMeanIou"); for (size_t i = 0; i < in_mean_ious.size(); ++i) { out_mean_iou_t.device(place) += EigenTensor::From(*in_mean_ious[i]); } - auto in_wrongs = ctx.MultiInput("InWrongs"); + auto in_wrongs = ctx.MultiInput("InWrongs"); for (size_t i = 0; i < in_wrongs.size(); ++i) { out_wrong_t.device(place) += EigenTensor::From(*in_wrongs[i]); } - auto in_corrects = ctx.MultiInput("InCorrects"); + auto in_corrects = ctx.MultiInput("InCorrects"); for (size_t i = 0; i < in_corrects.size(); ++i) { out_correct_t.device(place) += EigenTensor::From(*in_corrects[i]); } diff --git a/paddle/fluid/operators/mean_op_mlu.cc b/paddle/fluid/operators/mean_op_mlu.cc index 4301cde33e337..8fea989941c88 100644 --- a/paddle/fluid/operators/mean_op_mlu.cc +++ b/paddle/fluid/operators/mean_op_mlu.cc @@ -20,14 +20,14 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class MeanMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); const T* in_data = input->data(); T* out_data = output->mutable_data(context.GetPlace()); @@ -77,14 +77,16 @@ template class MeanMLUGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto output_grad = context.Input(framework::GradVarName("Out")); + auto output_grad = + context.Input(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(output_grad->numel(), 1, platform::errors::InvalidArgument( "Mean Gradient Input Tensor len should be 1. But " "received Out@Grad's elements num is %d.", output_grad->numel())); - auto input_grad = context.Output(framework::GradVarName("X")); + auto input_grad = + context.Output(framework::GradVarName("X")); input_grad->mutable_data(context.GetPlace()); auto in_data = output_grad->data(); diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc index 76f1dcb43a3a2..bee3f8b0696b1 100644 --- a/paddle/fluid/operators/mean_op_npu.cc +++ b/paddle/fluid/operators/mean_op_npu.cc @@ -16,7 +16,7 @@ limitations under the License. 
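Note: mean_iou illustrates the duplicable-input accessors: MultiInput/MultiOutput now take phi::DenseTensor as the template argument, and each element is a const phi::DenseTensor*. A minimal sketch that only accumulates element counts; the "InWrongs" slot name follows the mean_iou hunk, any duplicable slot works the same way:

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/dense_tensor.h"

// Counts the total number of elements across a duplicable input slot.
int64_t TotalElements(const paddle::framework::ExecutionContext& ctx) {
  auto ins = ctx.MultiInput<phi::DenseTensor>("InWrongs");
  int64_t total = 0;
  for (const phi::DenseTensor* t : ins) {
    total += t->numel();
  }
  return total;
}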
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class MeanNPUKernel : public framework::OpKernel { @@ -49,7 +49,7 @@ class MeanGradNPUKernel : public framework::OpKernel { context.template device_context() .stream(); - auto grad = context.Input(framework::GradVarName("Out")); + auto grad = context.Input(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(grad->numel(), 1, @@ -58,7 +58,7 @@ class MeanGradNPUKernel : public framework::OpKernel { "received Out@Grad's elements num is %d.", grad->numel())); - auto IG = context.Output(framework::GradVarName("X")); + auto IG = context.Output(framework::GradVarName("X")); IG->mutable_data(context.GetPlace()); // ones diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc index ed99fd5bf8783..80181779ab347 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.cc +++ b/paddle/fluid/operators/memcpy_d2h_op.cc @@ -39,7 +39,7 @@ class MemcpyD2HOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index b1126fb12818e..8d2cfcff80768 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -40,7 +40,7 @@ class MemcpyH2DOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index 273b1fe7c9e70..caa4164ee5bc0 100644 --- a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -56,7 +56,7 @@ class MemcpyOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc index 0b95200c12828..7921e8844c112 100644 --- a/paddle/fluid/operators/meshgrid_op.cc +++ b/paddle/fluid/operators/meshgrid_op.cc @@ -25,8 +25,6 @@ namespace paddle { namespace operators { -using framework::Tensor; - class MeshgridOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -34,7 +32,7 @@ class MeshgridOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto inputs = ctx.MultiInput("X"); + auto inputs = ctx.MultiInput("X"); auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto* input : inputs) { diff --git a/paddle/fluid/operators/meshgrid_op_mlu.cc b/paddle/fluid/operators/meshgrid_op_mlu.cc index 09aaf695f7556..76beb021bc654 100644 --- 
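Note: the memcpy operators change only the tensor parameter of GetKernelTypeForVar; the override still builds its result from the expected kernel type. A minimal sketch inside a hypothetical operator class; the use of tensor.layout() is an illustrative assumption, not copied from the elided hunk body, and InferShape plus the other required overrides are omitted:

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

class ExampleOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name,
      const phi::DenseTensor& tensor,  // was framework::Tensor before the patch
      const framework::OpKernelType& expected_kernel_type) const override {
    // Keep the expected data type and place; taking the layout from the
    // variable itself is an illustrative choice (see note above).
    return framework::OpKernelType(expected_kernel_type.data_type_,
                                   expected_kernel_type.place_,
                                   tensor.layout());
  }
};

}  // namespace operators
}  // namespace paddle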
a/paddle/fluid/operators/meshgrid_op_mlu.cc +++ b/paddle/fluid/operators/meshgrid_op_mlu.cc @@ -22,8 +22,8 @@ template class MeshgridMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); + auto ins = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); PADDLE_ENFORCE_EQ( (ins.size() > 1) && (ins.size() < 7), true, diff --git a/paddle/fluid/operators/meshgrid_op_npu.cc b/paddle/fluid/operators/meshgrid_op_npu.cc index 86e45fb66284a..8c4e67d787e92 100644 --- a/paddle/fluid/operators/meshgrid_op_npu.cc +++ b/paddle/fluid/operators/meshgrid_op_npu.cc @@ -22,8 +22,8 @@ template class MeshgridNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto ins = context.MultiInput("X"); - auto outs = context.MultiOutput("Out"); + auto ins = context.MultiInput("X"); + auto outs = context.MultiOutput("Out"); PADDLE_ENFORCE_EQ( (ins.size() > 1) && (ins.size() < 7), true, @@ -55,7 +55,7 @@ class MeshgridNPUKernel : public framework::OpKernel { view_shape[i] = shape[i]; framework::DDim out_dims_reshape = phi::make_ddim(view_shape); - framework::Tensor reshape_ins_tensor(ins[i]->dtype()); + phi::DenseTensor reshape_ins_tensor(ins[i]->dtype()); reshape_ins_tensor.ShareDataWith(*ins[i]); reshape_ins_tensor.Resize(out_dims_reshape); diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 96ca608a39b93..ec78fb09eab30 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -23,12 +23,12 @@ template class AccuracyMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); int num_samples = indices->dims()[0]; if (num_samples == 0) { diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index 2f6c8f5718eff..a53ba79a4c534 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -23,13 +23,13 @@ template class AccuracyNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* label = ctx.Input("Label"); - auto* indices = ctx.Input("Indices"); + auto* inference = ctx.Input("Out"); + auto* label = ctx.Input("Label"); + auto* indices = ctx.Input("Indices"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index f2c04d8fbcfc7..f3f39a40fbaea 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ 
b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -21,17 +21,17 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = paddle::framework::Tensor; +using Tensor = phi::DenseTensor; template class AccuracyXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); int* correct_data = correct->mutable_data(ctx.GetPlace()); int* total_data = total->mutable_data(ctx.GetPlace()); float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/metrics/precision_recall_op.h b/paddle/fluid/operators/metrics/precision_recall_op.h index b8a5e49ef5a24..55be510dcd237 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.h +++ b/paddle/fluid/operators/metrics/precision_recall_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template @@ -31,13 +31,13 @@ template class PrecisionRecallKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in0 = ctx.Input("Indices"); - auto* in1 = ctx.Input("Labels"); - auto* in2 = ctx.Input("Weights"); - auto* in3 = ctx.Input("StatesInfo"); - auto* out0 = ctx.Output("BatchMetrics"); - auto* out1 = ctx.Output("AccumMetrics"); - auto* out2 = ctx.Output("AccumStatesInfo"); + auto* in0 = ctx.Input("Indices"); + auto* in1 = ctx.Input("Labels"); + auto* in2 = ctx.Input("Weights"); + auto* in3 = ctx.Input("StatesInfo"); + auto* out0 = ctx.Output("BatchMetrics"); + auto* out1 = ctx.Output("AccumMetrics"); + auto* out2 = ctx.Output("AccumStatesInfo"); const int* ids_data = in0->data(); const int* labels_data = in1->data(); diff --git a/paddle/fluid/operators/minus_op.h b/paddle/fluid/operators/minus_op.h index e6dc80c7fedb3..0a576e875a458 100644 --- a/paddle/fluid/operators/minus_op.h +++ b/paddle/fluid/operators/minus_op.h @@ -24,9 +24,9 @@ template class MinusKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* left_tensor = context.Input("X"); - auto* right_tensor = context.Input("Y"); - auto* out_tensor = context.Output("Out"); + auto* left_tensor = context.Input("X"); + auto* right_tensor = context.Input("Y"); + auto* out_tensor = context.Output("Out"); out_tensor->mutable_data(context.GetPlace()); auto& dev = diff --git a/paddle/fluid/operators/miopen_lstm_cache.h b/paddle/fluid/operators/miopen_lstm_cache.h index ec13337d8f0bc..a9a6482fd485c 100644 --- a/paddle/fluid/operators/miopen_lstm_cache.h +++ b/paddle/fluid/operators/miopen_lstm_cache.h @@ -51,7 +51,7 @@ class ScopedRNNBase { const std::vector& sequence_length, size_t* workspace_size, size_t* reserve_size, - framework::Tensor* dropout_state) { + phi::DenseTensor* dropout_state) { int numDirections = is_bidirec_ ? 
2 : 1; miopenDataType_t miopen_type = platform::CudnnDataType::type; diff --git a/paddle/fluid/operators/miopen_rnn_cache.h b/paddle/fluid/operators/miopen_rnn_cache.h index b568ffbb09cc8..19255363259b5 100644 --- a/paddle/fluid/operators/miopen_rnn_cache.h +++ b/paddle/fluid/operators/miopen_rnn_cache.h @@ -53,7 +53,7 @@ struct CudnnRNNCache { miopenTensorDescriptor_t dw_desc_; size_t workspace_size_; - framework::Tensor workspace_data_; + phi::DenseTensor workspace_data_; size_t seq_length_; @@ -78,7 +78,7 @@ struct CudnnRNNCache { int seed, int weight_numel, size_t *reserve_size_, - framework::Tensor *dropout_state_, + phi::DenseTensor *dropout_state_, bool initialized, miopenDataType_t miopen_type) { seq_length_ = seq_len; diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 728d86cd94e33..c2a055f96bd4a 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -27,7 +27,7 @@ using dnnl::memory; using dnnl::primitive; using dnnl::stream; using framework::DataLayout; -using framework::Tensor; + using platform::GetMKLDNNFormat; using platform::MKLDNNDeviceContext; using platform::to_void_cast; @@ -62,8 +62,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, auto &dev_ctx = ctx.template device_context(); const auto &mkldnn_engine = dev_ctx.GetEngine(); - const auto *x = ctx.Input("X"); - auto *out = ctx.Output("Out"); + const auto *x = ctx.Input("X"); + auto *out = ctx.Output("Out"); bool is_inplaced = x->IsSharedBufferWith(*out); @@ -94,9 +94,9 @@ void eltwise_grad(const framework::ExecutionContext &ctx, auto &dev_ctx = ctx.template device_context(); const auto &mkldnn_engine = dev_ctx.GetEngine(); - const auto *x = ctx.Input("X"); - const auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); + const auto *x = ctx.Input("X"); + const auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); platform::ActivationMKLDNNHandler handler( algorithm, ctx, mkldnn_engine, ctx.GetPlace(), x, dout); @@ -122,9 +122,9 @@ void eltwise_grad_use_out(const framework::ExecutionContext &ctx, auto &dev_ctx = ctx.template device_context(); const auto &mkldnn_engine = dev_ctx.GetEngine(); - const auto *out = ctx.Input("Out"); - const auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); + const auto *out = ctx.Input("Out"); + const auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); platform::ActivationMKLDNNHandler handler( algorithm, ctx, mkldnn_engine, ctx.GetPlace(), out, dout); diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index f41068dd5f1ae..d7575f0ebf885 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -128,13 +128,13 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT< } std::shared_ptr AcquireMeanMemory( - const framework::Tensor *mean) { + const phi::DenseTensor *mean) { const T *mean_data = mean->data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), to_void_cast(mean_data)); } - std::shared_ptr AcquireMeanMemory(framework::Tensor *mean) { + std::shared_ptr AcquireMeanMemory(phi::DenseTensor *mean) { T *mean_data = 
mean->mutable_data(this->place_, this->fwd_pd_->mean_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), @@ -142,14 +142,14 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT< } std::shared_ptr AcquireVarianceMemory( - const framework::Tensor *variance) { + const phi::DenseTensor *variance) { const T *variance_data = variance->data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), to_void_cast(variance_data)); } std::shared_ptr AcquireVarianceMemory( - framework::Tensor *variance) { + phi::DenseTensor *variance) { T *variance_data = variance->mutable_data( this->place_, this->fwd_pd_->variance_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), @@ -170,13 +170,13 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const bool test_mode = is_test && (!trainable_stats); const bool global_stats = test_mode || use_global_stats; - const auto *x = ctx.Input("X"); - const auto *scale = ctx.Input("Scale"); - const auto *shift = ctx.Input("Bias"); + const auto *x = ctx.Input("X"); + const auto *scale = ctx.Input("Scale"); + const auto *shift = ctx.Input("Bias"); - auto *y = ctx.Output("Y"); - auto *batch_mean = ctx.Output("SavedMean"); - auto *batch_variance = ctx.Output("SavedVariance"); + auto *y = ctx.Output("Y"); + auto *batch_mean = ctx.Output("SavedMean"); + auto *batch_variance = ctx.Output("SavedVariance"); BatchNormMKLDNNHandler handler( ctx, mkldnn_engine, x, global_stats, test_mode); @@ -190,8 +190,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { if (global_stats) { // mean and variance are taken from input Tensor - const auto *mean = ctx.Input("Mean"); - const auto *variance = ctx.Input("Variance"); + const auto *mean = ctx.Input("Mean"); + const auto *variance = ctx.Input("Variance"); mean_memory = handler.AcquireMeanMemory(mean); variance_memory = handler.AcquireVarianceMemory(variance); @@ -213,8 +213,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { astream.wait(); if (!global_stats) { - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); const float momentum = ctx.Attr("momentum"); const unsigned int C = phi::vectorize(scale->dims())[0]; @@ -246,15 +246,18 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); - const auto *x = ctx.Input("X"); - const auto *scale = ctx.Input("Scale"); - const auto *shift = ctx.Input("Bias"); - const auto *batch_mean = ctx.Input("SavedMean"); - const auto *batch_variance = ctx.Input("SavedVariance"); - const auto *diff_y = ctx.Input(framework::GradVarName("Y")); - auto *diff_x = ctx.Output(framework::GradVarName("X")); - auto *diff_scale = ctx.Output(framework::GradVarName("Scale")); - auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); + const auto *x = ctx.Input("X"); + const auto *scale = ctx.Input("Scale"); + const auto *shift = ctx.Input("Bias"); + const auto *batch_mean = ctx.Input("SavedMean"); + const auto *batch_variance = ctx.Input("SavedVariance"); + const auto *diff_y = + ctx.Input(framework::GradVarName("Y")); + auto *diff_x = ctx.Output(framework::GradVarName("X")); + auto *diff_scale = + ctx.Output(framework::GradVarName("Scale")); + auto *diff_shift = + ctx.Output(framework::GradVarName("Bias")); 
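[Editorial note] The hunks in this file, like most hunks in this patch, apply one mechanical substitution: the per-file alias `using Tensor = framework::Tensor;` is dropped or redefined, and every `ctx.Input<Tensor>(...)`, `ctx.Output<Tensor>(...)` and `framework::Tensor*` parameter is rewritten against `phi::DenseTensor`. A minimal sketch of the pattern; the kernel body and input/output names below are illustrative and not taken verbatim from any file in this patch:

    // Before: relied on the per-file alias `using Tensor = framework::Tensor;`
    const auto* x = ctx.Input<Tensor>("X");
    auto* out = ctx.Output<Tensor>("Out");

    // After: names the phi type directly, so the local alias becomes unnecessary
    const auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");

Calls that exceed the line-length limit after the substitution are re-wrapped, which is why hunks such as the gradient kernel above touch more lines than the type change alone would require.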
BatchNormMKLDNNHandler handler(ctx, mkldnn_engine, x, scale, diff_y); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index b16576505dfd3..5df17de25bbe8 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -28,7 +28,7 @@ using dnnl::primitive; using dnnl::stream; using framework::DataLayout; using framework::LoDTensor; -using framework::Tensor; + using platform::to_void_cast; template @@ -37,8 +37,8 @@ class ConcatMKLDNNHandler public: ConcatMKLDNNHandler(const framework::ExecutionContext& ctx, const dnnl::engine mkldnn_engine, - const std::vector& inputs, - Tensor* output) + const std::vector& inputs, + phi::DenseTensor* output) : platform::MKLDNNHandlerNoCachingT(mkldnn_engine, ctx.GetPlace()) { int concat_axis = ctx.Attr("axis"); @@ -53,7 +53,7 @@ class ConcatMKLDNNHandler concat_axis)); if (ctx.HasInput("AxisTensor")) { - auto* axis_tensor = ctx.Input("AxisTensor"); + auto* axis_tensor = ctx.Input("AxisTensor"); concat_axis = GetDataFromTensor(axis_tensor)[0]; auto out_dims = inputs[0]->dims(); for (size_t i = 1; i < inputs.size(); ++i) { @@ -110,14 +110,15 @@ class ConcatMKLDNNHandler dst_md, concat_axis, srcs_md, this->engine_)); } - std::shared_ptr AcquireSrcMemory(const Tensor& input, int i) { + std::shared_ptr AcquireSrcMemory(const phi::DenseTensor& input, + int i) { const T* input_data = input.data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i), to_void_cast(input_data)); } }; -static void EnforceLayouts(const std::vector inputs) { +static void EnforceLayouts(const std::vector inputs) { for (auto* input : inputs) { PADDLE_ENFORCE_EQ( input->layout(), @@ -127,13 +128,14 @@ static void EnforceLayouts(const std::vector inputs) { } // From a multi-input, gather only nonempty inputs -static const std::vector ReduceMultiInput( - const std::vector& inputs) { - std::vector reduced(inputs.size()); - auto end_it = std::copy_if( - inputs.begin(), inputs.end(), reduced.begin(), [](const Tensor* t) { - return t->numel() > 0; - }); +static const std::vector ReduceMultiInput( + const std::vector& inputs) { + std::vector reduced(inputs.size()); + auto end_it = + std::copy_if(inputs.begin(), + inputs.end(), + reduced.begin(), + [](const phi::DenseTensor* t) { return t->numel() > 0; }); reduced.resize(std::distance(reduced.begin(), end_it)); return reduced; } @@ -147,9 +149,9 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { const auto& mkldnn_engine = dev_ctx.GetEngine(); // If any of the multiple inputs of concat has an input size of 0, the // actual size of the multi_input will change - auto multi_input = ReduceMultiInput(ctx.MultiInput("X")); + auto multi_input = ReduceMultiInput(ctx.MultiInput("X")); EnforceLayouts(multi_input); - Tensor* output = ctx.Output("Out"); + phi::DenseTensor* output = ctx.Output("Out"); ConcatMKLDNNHandler handler(ctx, mkldnn_engine, multi_input, output); @@ -187,7 +189,8 @@ class ConcatGradMKLDNNOpKernel : public paddle::framework::OpKernel { auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); const auto x = ctx.MultiInput("X"); - const auto* dout = ctx.Input(framework::GradVarName("Out")); + const auto* dout = + ctx.Input(framework::GradVarName("Out")); auto dx = ctx.MultiOutput(framework::GradVarName("X")); for (size_t i = 0; i < dx.size(); ++i) { @@ -198,7 +201,7 @@ class ConcatGradMKLDNNOpKernel : public paddle::framework::OpKernel { int axis = ctx.Attr("axis"); if 
(ctx.HasInput("AxisTensor")) { - auto* axis_tensor = ctx.Input("AxisTensor"); + auto* axis_tensor = ctx.Input("AxisTensor"); axis = GetDataFromTensor(axis_tensor)[0]; } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index fc8f29913097c..6b5f1f6a35741 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -34,12 +34,13 @@ inline MKLDNNMemoryFormat GetWeightsFormat(const int groups, } } -static dnnl::memory::data_type GetDstType(bool is_int8, - bool is_bfloat16, - bool force_fp32_output, - std::string fuse_activation, - bool fuse_residual_conn, - const Tensor* residual_param) { +static dnnl::memory::data_type GetDstType( + bool is_int8, + bool is_bfloat16, + bool force_fp32_output, + std::string fuse_activation, + bool fuse_residual_conn, + const phi::DenseTensor* residual_param) { auto dst_dt = dnnl::memory::data_type::f32; if (is_int8) { dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6") @@ -76,10 +77,10 @@ class ConvMKLDNNHandlerT const platform::MKLDNNDeviceContext& dev_ctx, const dnnl::engine mkldnn_engine, platform::Place cpu_place, - const Tensor* input, - const Tensor* filter, - const Tensor* bias, - Tensor* output, + const phi::DenseTensor* input, + const phi::DenseTensor* filter, + const phi::DenseTensor* bias, + phi::DenseTensor* output, const std::string& unique_name) : platform::MKLDNNHandlerTdev_ctx_.GetBlob(key_bs)); if (bias_scale_tuple) return bias_scale_tuple; - const auto* filter = ctx.Input("Filter"); + const auto* filter = ctx.Input("Filter"); const auto& weights_tz = phi::vectorize(filter->dims()); const int groups = std::max(ctx.Attr("groups"), 1); @@ -482,7 +483,7 @@ class ConvMKLDNNHandlerT std::tuple, float> get_int8_scales( const framework::ExecutionContext& ctx) const { - const auto* filter = ctx.Input("Filter"); + const auto* filter = ctx.Input("Filter"); const auto& weights_tz = phi::vectorize(filter->dims()); const bool& force_fp32_output = ctx.Attr("force_fp32_output"); @@ -567,7 +568,7 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireWeightsMemoryWithReorderFromDataPrimitive( - const framework::Tensor* filter, const int groups, const bool is_conv3d) { + const phi::DenseTensor* filter, const int groups, const bool is_conv3d) { const K* filter_data = filter->data(); auto weights_tz = phi::vectorize(filter->dims()); platform::GetGroupConvWeightsTz(weights_tz, groups); @@ -586,7 +587,7 @@ class ConvMKLDNNHandlerT } std::shared_ptr AcquireSrcMemoryWithReorder( - const framework::Tensor* input) { + const phi::DenseTensor* input) { return this->AcquireMemoryWithReorderPrimitive(input, "@src_mem_p_user", "@src_mem_p_target", @@ -595,7 +596,7 @@ class ConvMKLDNNHandlerT } std::shared_ptr AcquireSrcMemoryWithReorderFromWeightsPrimitive( - const framework::Tensor* input) { + const phi::DenseTensor* input) { return this->AcquireMemoryWithReorderPrimitive(input, "@src_mem_w_p_user", "@src_mem_w_p_target", @@ -605,7 +606,7 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireDiffDstMemoryWithReorderFromWeightsPrimitive( - const framework::Tensor* out_grad) { + const phi::DenseTensor* out_grad) { return this->AcquireMemoryWithReorderPrimitive( out_grad, "@diff_dst_mem_w_p_user", @@ -616,7 +617,7 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireDiffDstMemoryWithReorderMemoryFromDataPrimitive( - const framework::Tensor* out_grad) { + const phi::DenseTensor* out_grad) { return this->AcquireMemoryWithReorderPrimitive( out_grad, 
"@diff_dst_mem_p_user", @@ -626,7 +627,7 @@ class ConvMKLDNNHandlerT } std::shared_ptr AcquireMemoryWithReorderPrimitive( - const framework::Tensor* in_mem, + const phi::DenseTensor* in_mem, const char* key_mem_user, const char* key_mem_target, const char* key_mem, @@ -653,7 +654,7 @@ class ConvMKLDNNHandlerT } std::shared_ptr AcquireWeightsMemoryWithReorder( - const framework::Tensor* filter, + const phi::DenseTensor* filter, const int groups, const bool is_conv3d, const bool is_test, @@ -706,7 +707,7 @@ class ConvMKLDNNHandlerT } std::shared_ptr AcquireBiasMemoryWithReorder( - const framework::Tensor* bias, + const phi::DenseTensor* bias, const bool is_test, const std::vector& scale_data = {1.0f}, int mask = 0) { @@ -736,7 +737,7 @@ class ConvMKLDNNHandlerT } std::shared_ptr AcquireResidualMemory( - const framework::Tensor* residual_param) { + const phi::DenseTensor* residual_param) { void* residual_data = framework::TransToProtoVarType(residual_param->dtype()) == framework::DataTypeTrait::DataType() @@ -754,7 +755,7 @@ class ConvMKLDNNHandlerT } std::shared_ptr AcquireDstMemoryWithResidual( - framework::Tensor* output, const framework::Tensor* residual_param) { + phi::DenseTensor* output, const phi::DenseTensor* residual_param) { std::shared_ptr dst_memory_p; if (residual_param->mem_desc() != this->fwd_pd_->dst_desc()) { auto residual_memory_p = this->AcquireResidualMemory(residual_param); @@ -784,7 +785,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { bool is_INT8 = std::is_same::value || std::is_same::value; bool is_BFLOAT16 = ctx.Attr("mkldnn_data_type") == "bfloat16"; - auto residual_param = ctx.Input("ResidualData"); + auto residual_param = ctx.Input("ResidualData"); bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); std::string fuse_activation = ctx.Attr("fuse_activation"); bool force_fp32_output = ctx.Attr("force_fp32_output"); @@ -821,11 +822,11 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { const bool is_conv3d = ctx.Attr>("strides").size() == 3U; const bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - const auto* input = ctx.Input("Input"); - const auto* filter = ctx.Input("Filter"); + const auto* input = ctx.Input("Input"); + const auto* filter = ctx.Input("Filter"); const auto* bias = - ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - auto* output = ctx.Output("Output"); + ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); ConvMKLDNNHandlerT handler( ctx, @@ -845,7 +846,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { std::shared_ptr dst_memory_p; if (fuse_residual_conn) { - auto* residual_param = ctx.Input("ResidualData"); + auto* residual_param = ctx.Input("ResidualData"); dst_memory_p = handler.AcquireDstMemoryWithResidual(output, residual_param); } else { @@ -898,10 +899,11 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { platform::errors::Unimplemented( "residual fusion does not support force output with fp32")); - auto* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - auto* output = ctx.Output("Output"); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = + ctx.HasInput("Bias") ? 
ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); ConvMKLDNNHandlerT handler( ctx, @@ -927,7 +929,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { std::shared_ptr dst_memory_p; if (fuse_residual_conn) { - auto* residual_param = ctx.Input("ResidualData"); + auto* residual_param = ctx.Input("ResidualData"); PADDLE_ENFORCE_EQ( output->dims(), residual_param->dims(), @@ -998,14 +1000,16 @@ class ConvMKLDNNGradOpKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const Tensor* input = ctx.Input("Input"); - const Tensor* filter = ctx.Input("Filter"); - const Tensor* bias = - ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - const Tensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); - Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + const phi::DenseTensor* input = ctx.Input("Input"); + const phi::DenseTensor* filter = ctx.Input("Filter"); + const phi::DenseTensor* bias = + ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + const phi::DenseTensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + phi::DenseTensor* input_grad = + ctx.Output(framework::GradVarName("Input")); + phi::DenseTensor* filter_grad = + ctx.Output(framework::GradVarName("Filter")); if (!input_grad && !filter_grad) return; diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 80163389318aa..d2dfc9a9c1ccf 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -21,10 +21,11 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using framework::DataLayout; -inline dnnl::memory::dims GetWeightsTz(const Tensor* filter, const int groups) { +inline dnnl::memory::dims GetWeightsTz(const phi::DenseTensor* filter, + const int groups) { auto weights_tz = phi::vectorize(filter->dims()); int g = std::max(groups, 1); int g_dim = (g > 1) ? 
1 : 0; @@ -40,10 +41,10 @@ class ConvTransposeMKLDNNHandlerT public: ConvTransposeMKLDNNHandlerT(const framework::ExecutionContext& ctx, const dnnl::engine mkldnn_engine, - const Tensor* input, - const Tensor* filter, - const Tensor* bias, - Tensor* output) + const phi::DenseTensor* input, + const phi::DenseTensor* filter, + const phi::DenseTensor* bias, + phi::DenseTensor* output) : platform::MKLDNNHandlerNoCachingT( mkldnn_engine, ctx.GetPlace()), is_test_(ctx.Attr("is_test")) { @@ -218,7 +219,7 @@ class ConvTransposeMKLDNNHandlerT } std::shared_ptr AcquireSrcMemoryWithReorder( - const framework::Tensor* input) { + const phi::DenseTensor* input) { const T* input_data = input->data(); return platform::MKLDNNHandlerNoCachingT:: AcquireMemoryWithReorder(input->mem_desc(), @@ -229,7 +230,7 @@ class ConvTransposeMKLDNNHandlerT std::shared_ptr AcquireWeightsMemoryWithReorder( const platform::MKLDNNDeviceContext& dev_ctx, const std::string& key, - const framework::Tensor* filter, + const phi::DenseTensor* filter, const int& groups) { const K* filter_data = filter->data(); auto weights_tz = GetWeightsTz(filter, groups); @@ -331,7 +332,7 @@ class ConvTransposeMKLDNNHandlerT std::shared_ptr AcquireBiasMemoryWithReorder( const platform::MKLDNNDeviceContext& dev_ctx, const std::string& key, - const framework::Tensor* bias) { + const phi::DenseTensor* bias) { const K* bias_data = bias->data(); auto user_bias_md = platform::MKLDNNMemDesc(phi::vectorize(bias->dims()), @@ -377,11 +378,11 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const auto* input = ctx.Input("Input"); - const auto* filter = ctx.Input("Filter"); + const auto* input = ctx.Input("Input"); + const auto* filter = ctx.Input("Filter"); const auto* bias = - ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - auto* output = ctx.Output("Output"); + ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); ConvTransposeMKLDNNHandlerT handler( ctx, mkldnn_engine, input, filter, bias, output); auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index c25c662d612b1..4ceddf53f9458 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "dnnl.hpp" +#include "dnnl.hpp" // NOLINT #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/dequantize_op.h" @@ -27,7 +27,7 @@ using dnnl::memory; using dnnl::primitive; using dnnl::reorder; using platform::to_void_cast; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using dnnl::stream; using framework::DataLayout; using platform::GetMKLDNNFormat; @@ -36,11 +36,11 @@ template class DeQuantOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); + auto* x = ctx.Input("Input"); const auto quantization_scale = ctx.Attr("Scale"); const auto quantization_shift = ctx.Attr("Shift"); const bool with_shift = quantization_shift != 0.0f; - auto* out = ctx.Output("Output"); + auto* out = ctx.Output("Output"); PADDLE_ENFORCE(quantization_scale != 0.0f, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc index d477fa0b2bf2c..98ebe42fa1f8d 100644 --- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc @@ -20,7 +20,7 @@ namespace { using paddle::framework::ExecutionContext; using paddle::framework::GradVarName; -using paddle::framework::Tensor; + using paddle::platform::MKLDNNDeviceContext; using phi::vectorize; @@ -35,8 +35,8 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { const auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - const auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto x_vec_dims = vectorize(x->dims()); @@ -99,8 +99,8 @@ class ExpandGradMKLDNNKernel : public paddle::framework::OpKernel { const auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - auto* dout = ctx.Input(GradVarName("Out")); - auto* dx = ctx.Output(GradVarName("X")); + auto* dout = ctx.Input(GradVarName("Out")); + auto* dx = ctx.Output(GradVarName("X")); auto dx_vec_dims = vectorize(dx->dims()); auto dout_vec_dims = vectorize(dout->dims()); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 7404972ea7cca..273f7b5c932e0 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -33,7 +33,7 @@ using framework::DataLayout; using framework::DDim; using framework::ExecutionContext; using framework::LoDTensor; -using framework::Tensor; + using platform::GetMKLDNNFormat; using platform::MKLDNNDeviceContext; using platform::to_void_cast; @@ -44,8 +44,8 @@ class FCPrimitiveFactory { explicit FCPrimitiveFactory(const dnnl::engine& engine) : engine_(engine) {} void ExecuteFcPrimitive(const LoDTensor* input, - const Tensor* weights, - const Tensor* bias, + const phi::DenseTensor* weights, + const phi::DenseTensor* bias, LoDTensor* output, const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx) { @@ -158,7 +158,7 @@ class FCPrimitiveFactory { // primitive. Therefore, function SetOutputFormat is needed to choose // an appropriate format based on the number of input dimensions and // format of an input tensor. 
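[Editorial note] The comment above motivates SetOutputFormat: the plain output format has to be chosen from the output rank and the input layout rather than taken from the primitive. A rough, self-contained sketch of the rank-based part of that decision, illustrative only and not the code from fc_mkldnn_op.cc:

    #include "dnnl.hpp"

    // Map a tensor rank onto a plain (non-blocked) oneDNN format tag.
    dnnl::memory::format_tag PlainFormatTagForRank(int rank) {
      switch (rank) {
        case 2: return dnnl::memory::format_tag::nc;
        case 3: return dnnl::memory::format_tag::ncw;
        case 4: return dnnl::memory::format_tag::nchw;
        case 5: return dnnl::memory::format_tag::ncdhw;
        default: return dnnl::memory::format_tag::undef;
      }
    }

The real kernel additionally inspects the incoming tensor's format (per the comment, "format of an input tensor"), so this sketch covers only the dimension-count half of the choice.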
- void SetOutputFormat(MKLDNNMemoryFormat in_format, Tensor* out) { + void SetOutputFormat(MKLDNNMemoryFormat in_format, phi::DenseTensor* out) { int dim_num = out->dims().size(); // In case of 2 dims, we set the only possible format, nc if (dim_num == 2) { @@ -184,8 +184,8 @@ class FCPrimitiveFactory { } void UpdateDataPointers(const ExecutionContext& ctx, - Tensor* out, - const Tensor* in) { + phi::DenseTensor* out, + const phi::DenseTensor* in) { input_->set_data_handle(to_void_cast(in->data())); output_->set_data_handle(out->mutable_data(ctx.GetPlace())); // If the primitive exists, but the output tensor has changed its @@ -198,8 +198,8 @@ class FCPrimitiveFactory { dnnl::inner_product_forward::primitive_desc Create2DFcPrimDescriptor( const LoDTensor* input, - const Tensor* weights, - const Tensor* bias, + const phi::DenseTensor* weights, + const phi::DenseTensor* bias, LoDTensor* output, const ExecutionContext& ctx) { auto src_desc = CreateMemDescriptor(input, MKLDNNMemoryFormat::any); @@ -212,7 +212,7 @@ class FCPrimitiveFactory { return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs); } - std::vector Get2DWeightDimsForDNNL(const Tensor* weights) { + std::vector Get2DWeightDimsForDNNL(const phi::DenseTensor* weights) { auto dims = phi::vectorize(weights->dims()); std::swap(dims[0], dims[1]); // swap input dim with output dim return dims; @@ -222,8 +222,8 @@ class FCPrimitiveFactory { dnnl::inner_product_forward::primitive_desc Create3DFcPrimDescriptor( const LoDTensor* input, - const Tensor* weights, - const Tensor* bias, + const phi::DenseTensor* weights, + const phi::DenseTensor* bias, LoDTensor* output, const ExecutionContext& ctx) { auto input_dims = phi::vectorize(input->dims()); @@ -245,20 +245,20 @@ class FCPrimitiveFactory { return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs); } - std::vector Get3DWeightDimsForDNNL(const Tensor* weights) { + std::vector Get3DWeightDimsForDNNL(const phi::DenseTensor* weights) { auto paddle_w_dims = phi::vectorize(weights->dims()); return {paddle_w_dims[1], paddle_w_dims[0], 1}; } - memory::desc Create3DUserWeightsDesc(const Tensor* weights) { + memory::desc Create3DUserWeightsDesc(const phi::DenseTensor* weights) { auto dims = Get3DWeightDimsForDNNL(weights); return CreateMemDescriptor(dims, MKLDNNMemoryFormat::oiw); } dnnl::inner_product_forward::primitive_desc Create4DFcPrimDescriptor( const LoDTensor* input, - const Tensor* weights, - const Tensor* bias, + const phi::DenseTensor* weights, + const phi::DenseTensor* bias, LoDTensor* output, const ExecutionContext& ctx) { auto src_desc = CreateMemDescriptor(input, MKLDNNMemoryFormat::any); @@ -274,7 +274,7 @@ class FCPrimitiveFactory { } std::vector Get4DWeightDimsForDNNL(const LoDTensor* input, - const Tensor* weights) { + const phi::DenseTensor* weights) { auto old_w_dims = phi::vectorize(weights->dims()); auto old_in_dims = phi::vectorize(input->dims()); auto dims = {old_w_dims[1], old_in_dims[1], old_in_dims[2], old_in_dims[3]}; @@ -282,7 +282,7 @@ class FCPrimitiveFactory { } memory::desc Create4DUserWeightsDesc(const LoDTensor* input, - const Tensor* weights) { + const phi::DenseTensor* weights) { auto dims = Get4DWeightDimsForDNNL(input, weights); return CreateMemDescriptor(dims, MKLDNNMemoryFormat::oihw); } @@ -351,7 +351,7 @@ class FCPrimitiveFactory { } template - static dnnl::memory::desc CreateMemDescriptor(const Tensor* tensor, + static dnnl::memory::desc CreateMemDescriptor(const phi::DenseTensor* tensor, MKLDNNMemoryFormat format) { 
auto dims = phi::vectorize(tensor->dims()); return CreateMemDescriptor(dims, format); @@ -359,7 +359,7 @@ class FCPrimitiveFactory { template dnnl::memory CreateMemory(const dnnl::memory::desc& desc, - const Tensor* tensor) { + const phi::DenseTensor* tensor) { return CreateMemory(desc, platform::to_void_cast(tensor->data())); } @@ -369,7 +369,7 @@ class FCPrimitiveFactory { template std::shared_ptr CreateMemoryToBeCached( - const dnnl::memory::desc& desc, const Tensor* tensor) { + const dnnl::memory::desc& desc, const phi::DenseTensor* tensor) { return CreateMemoryToBeCached(desc, platform::to_void_cast(tensor->data())); } @@ -380,7 +380,8 @@ class FCPrimitiveFactory { } // Create weights memory and transform to default MKL-DNN format - std::shared_ptr CreateWeightsMemory(const Tensor* weights) { + std::shared_ptr CreateWeightsMemory( + const phi::DenseTensor* weights) { auto dims = phi::vectorize(weights->dims()); std::swap(dims[0], dims[1]); // Correct output dimensions auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); @@ -557,10 +558,10 @@ class FCPrimitiveFactory { dnnl::memory CreateDstMemory( const dnnl::inner_product_forward::primitive_desc& fc_prim_desc, const ExecutionContext& ctx, - Tensor* output) { + phi::DenseTensor* output) { if (ctx.HasAttr("fuse_residual_connection") && ctx.Attr("fuse_residual_connection")) { - auto* residual_param = ctx.Output("ResidualData"); + auto* residual_param = ctx.Output("ResidualData"); PADDLE_ENFORCE_EQ( output->dims(), @@ -587,7 +588,7 @@ class FCPrimitiveFactory { void RecomputeOutputDims(const ExecutionContext& ctx, const LoDTensor* input, - const Tensor* w, + const phi::DenseTensor* w, LoDTensor* output) { int in_num_col_dims = ctx.Attr("in_num_col_dims"); bool padding_weights = ctx.Attr("padding_weights"); @@ -638,8 +639,8 @@ GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx, template static void ExecuteFc(const ExecutionContext& ctx, const LoDTensor* input, - const Tensor* w, - const Tensor* bias, + const phi::DenseTensor* w, + const phi::DenseTensor* bias, LoDTensor* output, bool fuse_relu, bool force_fp32_output) { @@ -679,8 +680,8 @@ class FCMKLDNNOpKernel : public framework::OpKernel { platform::errors::PreconditionNotMet("FC MKL-DNN must use CPUPlace.")); platform::MKLDNNDeviceContext::tls().log_lib_version(); auto input = ctx.Input("Input"); - auto w = ctx.Input("W"); - auto bias = ctx.Input("Bias"); + auto w = ctx.Input("W"); + auto bias = ctx.Input("Bias"); auto output = ctx.Output("Out"); bool fuse_relu = ctx.Attr("activation_type") == "relu"; diff --git a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc index e7e45b4b6e426..7673b66455f8f 100644 --- a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc @@ -18,13 +18,11 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; - template class FillConstantMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT { public: - FillConstantMKLDNNHandler(Tensor* out, + FillConstantMKLDNNHandler(phi::DenseTensor* out, dnnl::engine engine, platform::Place cpu_place) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { @@ -61,7 +59,7 @@ class FillConstantMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& dnnl_engine = dev_ctx.GetEngine(); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); T fill_value = CalculateFillValue(ctx); auto shape = GetShape(ctx); @@ -116,7 +114,7 @@ class FillConstantMKLDNNKernel : public framework::OpKernel { } if (ctx.HasInput("ValueTensor")) { - const auto* value_tensor = ctx.Input("ValueTensor"); + const auto* value_tensor = ctx.Input("ValueTensor"); PADDLE_ENFORCE_EQ( value_tensor->numel(), 1, diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 64d7bca4d0646..54c2e3e630a6a 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -35,8 +35,8 @@ class InterpolateMKLDNNHandler InterpolateMKLDNNHandler(const dnnl::algorithm algo, const dnnl::engine engine, platform::Place cpu_place, - const Tensor* x, - Tensor* out) + const phi::DenseTensor* x, + phi::DenseTensor* out) : platform::MKLDNNHandlerNoCachingT( engine, cpu_place) { const auto dst_tz = phi::vectorize(out->dims()); @@ -51,7 +51,7 @@ template class InterpolateMKLDNNKernel : public framework::OpKernel { std::vector ComputeOutputShape( const framework::ExecutionContext& ctx) const { - const auto* x = ctx.Input("X"); + const auto* x = ctx.Input("X"); const auto& in_dims = x->dims(); const framework::DDim in_dhw_dims = @@ -70,8 +70,8 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { out_dims.push_back(ctx.Attr("out_w")); } - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - auto out_size = ctx.Input("OutSize"); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + auto out_size = ctx.Input("OutSize"); if (list_new_size_tensor.size() > 0) { auto new_size = get_new_shape(list_new_size_tensor); if (new_size.size() == out_dims.size()) { @@ -85,7 +85,7 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { } else { std::vector scale; scale.reserve(3); - auto scale_tensor = ctx.Input("Scale"); + auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale.resize(3, scale_data[0]); @@ -136,8 +136,8 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); const auto interp_method = ctx.Attr("interp_method"); const dnnl::algorithm algo = (interp_method == "nearest") diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 9aa7e26530d74..d69185f4526ec 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -28,7 +28,7 @@ class LayerNormMKLDNNHandler const float& epsilon, const dnnl::normalization_flags& flags, const bool& is_test, - const Tensor* x, + const phi::DenseTensor* x, const dnnl::engine engine, 
platform::Place cpu_place) : platform::MKLDNNHandlerNoCachingT( @@ -39,8 +39,8 @@ class LayerNormMKLDNNHandler fwd_prop_kind, x->mem_desc(), epsilon, flags); } - std::shared_ptr AcquireScaleShiftMemory(const Tensor* scale, - const Tensor* shift) { + std::shared_ptr AcquireScaleShiftMemory( + const phi::DenseTensor* scale, const phi::DenseTensor* shift) { // OneDNN requires a single piece of memory for scale and shift data const unsigned int C = phi::vectorize(scale->dims())[0]; @@ -55,7 +55,7 @@ class LayerNormMKLDNNHandler return scaleshift_memory; } - std::shared_ptr AcquireMeanMemory(framework::Tensor* mean) { + std::shared_ptr AcquireMeanMemory(phi::DenseTensor* mean) { T* mean_data = mean->mutable_data(this->place_, this->fwd_pd_->mean_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), @@ -63,7 +63,7 @@ class LayerNormMKLDNNHandler } std::shared_ptr AcquireVarianceMemory( - framework::Tensor* variance) { + phi::DenseTensor* variance) { T* variance_data = variance->mutable_data( this->place_, this->fwd_pd_->variance_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), @@ -75,10 +75,10 @@ template class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Y"); + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* out = ctx.Output("Y"); const float epsilon = ctx.Attr("epsilon"); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); @@ -116,8 +116,8 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { {DNNL_ARG_DST, *dst_memory}}; if (!is_test) { - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); auto mean_memory = handler.AcquireMeanMemory(mean); auto variance_memory = handler.AcquireVarianceMemory(var); diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 7043b3b4dda0a..12e12ca428a32 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; template @@ -28,7 +27,7 @@ class LRNMKLDNNHandler LRNMKLDNNHandler(const framework::ExecutionContext& ctx, const dnnl::engine mkldnn_engine, platform::Place cpu_place, - const Tensor* input) + const phi::DenseTensor* input) : platform:: MKLDNNHandlerNoCachingT( @@ -59,9 +58,9 @@ class LRNMKLDNNHandler LRNMKLDNNHandler(const framework::ExecutionContext& ctx, const dnnl::engine mkldnn_engine, platform::Place cpu_place, - const Tensor* in_x, - const Tensor* out_grad, - Tensor* in_x_grad) + const phi::DenseTensor* in_x, + const phi::DenseTensor* out_grad, + phi::DenseTensor* in_x_grad) : platform:: MKLDNNHandlerNoCachingT( mkldnn_engine, cpu_place) { @@ -95,7 +94,8 @@ class LRNMKLDNNHandler k); } - std::shared_ptr AcquireWorkspaceMemory(Tensor* workspace) { + std::shared_ptr AcquireWorkspaceMemory( + phi::DenseTensor* workspace) { T* ptr = workspace->mutable_data( this->place_, this->fwd_pd_->workspace_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), @@ -103,7 +103,7 @@ class LRNMKLDNNHandler } std::shared_ptr AcquireBackwardWorkspaceMemory( - const Tensor* workspace) { + const phi::DenseTensor* workspace) { const T* workspace_data = workspace->data(); return this->AcquireMemoryFromPrimitive( this->fwd_pd_->workspace_desc(), @@ -128,9 +128,9 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - auto mid = ctx.Output("MidOut"); + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto mid = ctx.Output("MidOut"); LRNMKLDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), x); @@ -173,11 +173,11 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL LRNGrad must use CPUPlace")); - auto in_x = ctx.Input("X"); - auto mid = ctx.Input("MidOut"); + auto in_x = ctx.Input("X"); + auto mid = ctx.Input("MidOut"); - auto out_grad = ctx.Input(framework::GradVarName("Out")); - auto in_x_grad = ctx.Output(framework::GradVarName("X")); + auto out_grad = ctx.Input(framework::GradVarName("Out")); + auto in_x_grad = ctx.Output(framework::GradVarName("X")); auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h index 0abd53a5bb616..53dd177071496 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h @@ -24,7 +24,7 @@ namespace operators { using framework::ExecutionContext; using platform::MKLDNNDeviceContext; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class MatMulGradMKLDNNKernel : public framework::OpKernel { @@ -35,13 +35,13 @@ class MatMulGradMKLDNNKernel : public framework::OpKernel { void ExecuteMatMulGrad(const ExecutionContext& ctx, const MKLDNNDeviceContext& dev_ctx, const dnnl::engine& engine, - Tensor* x, + phi::DenseTensor* x, bool trans_x, bool is_fold_init_dims_x, - Tensor* y, + phi::DenseTensor* y, bool trans_y, bool is_fold_init_dims_y, - Tensor* out) const; + phi::DenseTensor* out) const; void RunKernel(const ExecutionContext& ctx) const; }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc 
b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 000e31aad9ac9..44296d12f2bac 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -24,7 +24,7 @@ using paddle::platform::MKLDNNFormatForSize; using paddle::platform::MKLDNNGetDataType; using paddle::platform::to_void_cast; using phi::vectorize; -using Tensor = paddle::framework::Tensor; +using Tensor = phi::DenseTensor; using paddle::framework::GradVarName; using phi::make_ddim; @@ -106,7 +106,7 @@ static paddle::framework::DDim ColumnMatrixDimsFromVector( phi::DDim GetDimForInput(const ExecutionContext &ctx, std::string input_name) { auto shape = ctx.Attr>("fused_reshape_" + input_name); auto axis = ctx.Attr>("fused_transpose_" + input_name); - auto input_dims = ctx.Input(input_name)->dims(); + auto input_dims = ctx.Input(input_name)->dims(); if (!shape.empty() && !axis.empty()) { return input_dims.reshape(shape).transpose(axis); } @@ -182,9 +182,9 @@ class MatMulMKLDNNHandler } public: - void Execute(const paddle::framework::Tensor *x, - const paddle::framework::Tensor *y, - paddle::framework::Tensor *out) { + void Execute(const phi::DenseTensor *x, + const phi::DenseTensor *y, + phi::DenseTensor *out) { const auto src_memory_p = this->AcquireSrcMemory(x); const auto weights_memory_p = this->AcquireWeightsMemory(y); const auto dst_memory_p = this->AcquireDstMemory(out); @@ -217,8 +217,7 @@ class MatMulMKLDNNHandler out->set_mem_desc(dst_memory_p->get_desc().reshape(out->dims())); } - std::shared_ptr AcquireDstMemory( - paddle::framework::Tensor *output) { + std::shared_ptr AcquireDstMemory(phi::DenseTensor *output) { // We cannot use base AcquireDstMemory as it makes an allocation request // base on DST memory primitive size. This is fine in general, but in MatMul // we have primitive that covers only one batch of Data and then shift @@ -241,7 +240,7 @@ class MatMulMKLDNNHandler const ExecutionContext &ctx, std::string input_name) { auto shape = ctx.Attr>("fused_reshape_" + input_name); auto axis = ctx.Attr>("fused_transpose_" + input_name); - auto input_dims = ctx.Input(input_name)->dims(); + auto input_dims = ctx.Input(input_name)->dims(); auto new_dims = input_dims; if (!shape.empty() && !axis.empty()) { new_dims = input_dims.reshape(shape).transpose(axis); @@ -478,9 +477,9 @@ static void ExecuteMatMul(const ExecutionContext &ctx) { ctx.HasAttr("fuse_activation") ? 
ctx.Attr("fuse_activation") == "relu" : false; - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *out = ctx.Output("Out"); + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); const auto &dev_ctx = ctx.template device_context(); const auto &onednn_engine = dev_ctx.GetEngine(); @@ -551,7 +550,7 @@ std::vector GetInputStrides(const ExecutionContext &ctx, const std::string input_name) { auto shape = ctx.Attr>("fused_reshape_" + input_name); auto axis = ctx.Attr>("fused_transpose_" + input_name); - auto input_dims = ctx.Input(input_name)->dims(); + auto input_dims = ctx.Input(input_name)->dims(); auto new_dims = input_dims; if (!shape.empty() && !axis.empty()) { new_dims = input_dims.reshape(shape).transpose(axis); @@ -639,7 +638,7 @@ void ExecuteMatMulV2(const ExecutionContext &ctx, {DNNL_ARG_DST, *dst_memory_p}}; if (ctx.HasInput("ResidualData")) { - auto *residual_data = ctx.Input("ResidualData"); + auto *residual_data = ctx.Input("ResidualData"); const auto residual_data_memory_p = handler.AcquireSrcMemory(residual_data); matmul_args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *residual_data_memory_p}); @@ -746,9 +745,9 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { const auto &dev_ctx = ctx.template device_context(); const auto &onednn_engine = dev_ctx.GetEngine(); - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *out = ctx.Output("Out"); + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); bool trans_x = ctx.HasAttr("trans_x") ? ctx.Attr("trans_x") : ctx.Attr("transpose_X"); bool trans_y = ctx.HasAttr("trans_y") ? ctx.Attr("trans_y") @@ -858,8 +857,8 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { const auto &dev_ctx = ctx.template device_context(); const auto &onednn_engine = dev_ctx.GetEngine(); - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); auto x_dims = vectorize(x->dims()); auto y_dims = vectorize(y->dims()); @@ -882,9 +881,9 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { return; } - auto *dout = ctx.Input(GradVarName("Out")); - auto *dx = ctx.Output(GradVarName("X")); - auto *dy = ctx.Output(GradVarName("Y")); + auto *dout = ctx.Input(GradVarName("Out")); + auto *dx = ctx.Output(GradVarName("X")); + auto *dy = ctx.Output(GradVarName("Y")); bool trans_x = ctx.HasAttr("trans_x") ? ctx.Attr("trans_x") : ctx.Attr("transpose_X"); @@ -1133,11 +1132,11 @@ void MatMulGradMKLDNNKernel::RunKernel(const ExecutionContext &ctx) const { ctx.template device_context(); const auto &onednn_engine = dev_ctx.GetEngine(); - auto x = *ctx.Input("X"); - auto y = *ctx.Input("Y"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *dy = ctx.Output(framework::GradVarName("Y")); + auto x = *ctx.Input("X"); + auto y = *ctx.Input("Y"); + auto dout = *ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dy = ctx.Output(framework::GradVarName("Y")); bool transpose_x = ctx.HasAttr("transpose_X") ? 
ctx.Attr("transpose_X") : ctx.Attr("trans_x"); diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index e9150b0c58f76..29329351de8d6 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -28,7 +28,6 @@ using framework::DataLayout; using framework::DDim; using framework::ExecutionContext; using framework::LoDTensor; -using framework::Tensor; using platform::MatMulV2MKLDNNHandler; using platform::MKLDNNDeviceContext; @@ -378,9 +377,9 @@ class MulMKLDNNINT8Kernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto &mkldnn_engine = dev_ctx.GetEngine(); - const Tensor *x = ctx.Input("X"); - const Tensor *y = ctx.Input("Y"); - Tensor *out = ctx.Output("Out"); + const Tensor *x = ctx.Input("X"); + const Tensor *y = ctx.Input("Y"); + Tensor *out = ctx.Output("Out"); auto out_dims = out->dims(); auto mul = GetMulPrimitive(dev_ctx, ctx, x, y, out, mkldnn_engine); @@ -451,9 +450,9 @@ class MulMKLDNNKernel : public framework::OpKernel { const auto &dev_ctx = ctx.template device_context(); const auto &onednn_engine = dev_ctx.GetEngine(); - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Y"); - auto *out = ctx.Output("Out"); + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); int x_num_col_dims = ctx.Attr("x_num_col_dims"); int y_num_col_dims = ctx.Attr("y_num_col_dims"); @@ -502,7 +501,8 @@ class MulGradMKLDNNKernel : public MulMKLDNNKernel { const auto *x = ctx.Input("X"); const auto *y = ctx.Input("Y"); - const auto *dout = ctx.Input(framework::GradVarName("Out")); + const auto *dout = + ctx.Input(framework::GradVarName("Out")); auto *dx = ctx.Output(framework::GradVarName("X")); auto *dy = ctx.Output(framework::GradVarName("Y")); diff --git a/paddle/fluid/operators/mkldnn/pad3d_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pad3d_mkldnn_op.cc index e7a528c452b8d..39af6d780ba86 100644 --- a/paddle/fluid/operators/mkldnn/pad3d_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pad3d_mkldnn_op.cc @@ -17,8 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - /* Pad3D is done by using up to 7 reorders. Following example is done on 2D data for simplicity, but it is straightforward to extend it to 3D case. 
@@ -72,9 +70,9 @@ class PadMKLDNNKernel : public framework::OpKernel { const auto& onednn_engine = dev_ctx.GetEngine(); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto* paddings_tensor = ctx.Input("Paddings"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto* paddings_tensor = ctx.Input("Paddings"); std::vector paddings(ctx.Attr>("paddings")); if (paddings_tensor) { std::copy(paddings_tensor->data(), diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc index da401a4947f55..e3b9d3ffd7c6a 100644 --- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using dnnl::memory; -using framework::Tensor; + using platform::GetMKLDNNFormat; using platform::MKLDNNDeviceContext; using platform::MKLDNNGetDataType; @@ -34,8 +34,8 @@ class PReluMKLDNNHandler PReluMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, const dnnl::engine engine, platform::Place cpu_place, - const Tensor* x, - const Tensor* weights, + const phi::DenseTensor* x, + const phi::DenseTensor* weights, const std::string& uniq_name, const std::string& mode, const std::string& data_format, @@ -70,7 +70,7 @@ class PReluMKLDNNHandler } std::shared_ptr AcquireWeightsMemoryPossiblyWithReorder( - const Tensor* weights, const bool is_test) { + const phi::DenseTensor* weights, const bool is_test) { const T* weights_data = weights->data(); // if weights are 1D, every format tag is correct, so we accept @@ -88,7 +88,7 @@ class PReluMKLDNNHandler is_test); } - std::shared_ptr AcquireDiffWeightsMemory(Tensor* output) { + std::shared_ptr AcquireDiffWeightsMemory(phi::DenseTensor* output) { T* output_data = output->mutable_data( this->place_, this->bwd_pd_->diff_weights_desc().get_size()); return this->AcquireMemoryFromPrimitive( @@ -108,9 +108,9 @@ class PReluMKLDNNKernel : public framework::OpKernel { const auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - const auto* x = ctx.Input("X"); - const auto* alpha = ctx.Input("Alpha"); - auto* out = ctx.Output("Out"); + const auto* x = ctx.Input("X"); + const auto* alpha = ctx.Input("Alpha"); + auto* out = ctx.Output("Out"); const bool is_test = ctx.Attr("is_test"); const auto mode = ctx.Attr("mode"); const auto data_format = ctx.Attr("data_format"); @@ -153,11 +153,12 @@ class PReluGradMKLDNNKernel : public framework::OpKernel { const auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dalpha = ctx.Output(framework::GradVarName("Alpha")); - auto* alpha = ctx.Input("Alpha"); + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dalpha = + ctx.Output(framework::GradVarName("Alpha")); + auto* alpha = ctx.Input("Alpha"); const bool is_test = ctx.Attr("is_test"); const auto mode = ctx.Attr("mode"); const auto data_format = ctx.Attr("data_format"); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 54827a9dd904b..af8843c74179e 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ 
b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -26,7 +26,7 @@ using dnnl::memory; using dnnl::primitive; using dnnl::reorder; using platform::to_void_cast; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using dnnl::stream; using framework::DataLayout; using platform::GetMKLDNNFormat; @@ -35,8 +35,8 @@ template class QuantOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out = ctx.Output("Output"); + auto* x = ctx.Input("Input"); + auto* out = ctx.Output("Output"); const auto quantization_scale = ctx.Attr("Scale"); const auto quantization_shift = ctx.Attr("Shift"); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index ea30d7a6c5fc2..abfef00ae1678 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "dnnl.hpp" +#include "dnnl.hpp" // NOLINT #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/requantize_op.h" @@ -24,7 +24,7 @@ namespace operators { using dnnl::memory; using dnnl::reorder; using platform::to_void_cast; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; namespace { @@ -38,13 +38,13 @@ template class ReQuantOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); + auto* input = ctx.Input("Input"); auto scale_in = ctx.Attr("Scale_in"); auto shift_in = ctx.Attr("Shift_in"); auto scale_out = ctx.Attr("Scale_out"); auto shift_out = ctx.Attr("Shift_out"); bool with_shift = shift_in != 0.0f || shift_out != 0.0f; - auto* output = ctx.Output("Output"); + auto* output = ctx.Output("Output"); PADDLE_ENFORCE_NE( scale_in, diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index ea56b84c90889..dea6abd0c02b4 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -35,7 +35,7 @@ using platform::GetMKLDNNFormat; using platform::to_void_cast; static std::vector extract_shape( - const std::vector& list_new_shape_tensor) { + const std::vector& list_new_shape_tensor) { std::vector vec_new_shape; vec_new_shape.reserve(list_new_shape_tensor.size()); @@ -158,7 +158,8 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { const framework::ExecutionContext& ctx, framework::DDim& x_dims, // NOLINT framework::DDim& out_dims) const { // NOLINT - auto list_new_shape_tensor = ctx.MultiInput("ShapeTensor"); + auto list_new_shape_tensor = + ctx.MultiInput("ShapeTensor"); if (list_new_shape_tensor.size() > 0) { auto new_shape = extract_shape(list_new_shape_tensor); out_dims = ValidateShape(new_shape, x_dims); @@ -202,7 +203,8 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { } protected: - static dnnl::memory::format_tag getPlainFormatTag(const Tensor* tensor) { + static dnnl::memory::format_tag getPlainFormatTag( + const phi::DenseTensor* tensor) { auto tensor_dims_size = tensor->dims().size(); PADDLE_ENFORCE_EQ( tensor_dims_size <= 6 && tensor_dims_size >= 1, diff --git 
a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index 6a05585a37c6f..0e0e77e33a6d1 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = phi::SelectedRows; @@ -43,7 +43,7 @@ class ShapeMKLDNNKernel : public framework::OpKernel { in_dims = phi::make_ddim(rdims); } } - auto* out_t = ctx.Output("Out"); + auto* out_t = ctx.Output("Out"); out_t->Resize({in_dims.size()}); auto out_data = out_t->mutable_data(platform::CPUPlace()); for (int i = 0; i < in_dims.size(); ++i) { diff --git a/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc index 97c8184ebec28..fd1b1927f5fbb 100644 --- a/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc @@ -17,13 +17,12 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; using platform::MKLDNNGetDataType; template class ShuffleChannelMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT { public: - ShuffleChannelMKLDNNHandler(const Tensor* x, + ShuffleChannelMKLDNNHandler(const phi::DenseTensor* x, const int group, const dnnl::engine engine, platform::Place cpu_place) @@ -43,8 +42,8 @@ class ShuffleChannelMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); // oneDNN handles group using C/g instead of g const int group = x->dims()[1] / ctx.Attr("group"); diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc index a7c6bd28486f8..05d05ab995a4b 100644 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using paddle::framework::Tensor; - template class SliceMKLDNNKernel : public framework::OpKernel { public: @@ -32,8 +30,8 @@ class SliceMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - auto* x = ctx.Input("Input"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("Input"); + auto* out = ctx.Output("Out"); auto x_vec_dims = phi::vectorize(x->dims()); @@ -48,18 +46,21 @@ class SliceMKLDNNKernel : public framework::OpKernel { std::vector ends(ctx.Attr>("ends").begin(), ctx.Attr>("ends").end()); - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto starts_tensor_list = + ctx.MultiInput("StartsTensorList"); if (ctx.HasInput("StartsTensor")) { - starts = GetDataFromTensor(ctx.Input("StartsTensor")); + starts = GetDataFromTensor( + ctx.Input("StartsTensor")); } else if (starts_tensor_list.size() > 0) { starts = GetDataFromTensorList(starts_tensor_list); } auto decrease_axis = ctx.Attr>("decrease_axis"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); if (ctx.HasInput("EndsTensor")) { - ends = GetDataFromTensor(ctx.Input("EndsTensor")); + ends = + GetDataFromTensor(ctx.Input("EndsTensor")); } else if (ends_tensor_list.size() > 0) { ends = GetDataFromTensorList(ends_tensor_list); } @@ -141,8 +142,8 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("Input")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("Input")); auto dx_vec_dims = phi::vectorize(dx->dims()); auto dout_vec_dims = phi::vectorize(dout->dims()); @@ -158,16 +159,19 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { std::vector ends(ctx.Attr>("ends").begin(), ctx.Attr>("ends").end()); - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto starts_tensor_list = + ctx.MultiInput("StartsTensorList"); if (ctx.HasInput("StartsTensor")) { - starts = GetDataFromTensor(ctx.Input("StartsTensor")); + starts = GetDataFromTensor( + ctx.Input("StartsTensor")); } else if (starts_tensor_list.size() > 0) { starts = GetDataFromTensorList(starts_tensor_list); } - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); if (ctx.HasInput("EndsTensor")) { - ends = GetDataFromTensor(ctx.Input("EndsTensor")); + ends = + GetDataFromTensor(ctx.Input("EndsTensor")); } else if (ends_tensor_list.size() > 0) { ends = GetDataFromTensorList(ends_tensor_list); } diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 2bb82186483da..644998ea5ecdb 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNMemDesc; @@ -39,8 +38,8 @@ class SoftmaxMKLDNNHandler public: SoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, platform::Place cpu_place, - const Tensor* input, - Tensor* output, + const phi::DenseTensor* input, + phi::DenseTensor* output, const int axis) : platform::MKLDNNHandlerNoCachingT { auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const Tensor* input = ctx.Input("X"); - Tensor* output = ctx.Output("Out"); + const phi::DenseTensor* input = ctx.Input("X"); + phi::DenseTensor* output = ctx.Output("Out"); bool is_inplaced = input->IsSharedBufferWith(*output); const int axis = @@ -143,9 +142,11 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const Tensor* output = ctx.Input("Out"); - auto* out_grad = ctx.template Input(framework::GradVarName("Out")); - auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); + const phi::DenseTensor* output = ctx.Input("Out"); + auto* out_grad = + ctx.template Input(framework::GradVarName("Out")); + auto* in_x_grad = + ctx.template Output(framework::GradVarName("X")); SoftmaxMKLDNNHandler handler(ctx, mkldnn_engine, diff --git a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h index c41864ee26f55..25886c5791fea 100644 --- a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h +++ b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h @@ -18,14 +18,12 @@ limitations under the License. */ namespace paddle { namespace operators { -using paddle::framework::Tensor; - template class SoftplusMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT { public: SoftplusMKLDNNHandler(const framework::ExecutionContext& ctx, - const Tensor* x, + const phi::DenseTensor* x, const float beta, const dnnl::engine engine) : platform::MKLDNNHandlerNoCachingT(engine, @@ -70,8 +68,8 @@ void custom_softplus_eltwise_forward(const framework::ExecutionContext& ctx) { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); bool is_inplaced = x->IsSharedBufferWith(*out); diff --git a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc index f71931ad1ecc7..33c8c563a9f03 100644 --- a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using paddle::framework::Tensor; - static inline std::vector> CalculateOutsDims( const framework::DDim& in_dims, const size_t num, @@ -63,8 +61,8 @@ class SplitMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - const auto* x = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); + const auto* x = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); int num = ctx.Attr("num"); auto sections = ctx.Attr>("sections"); @@ -74,12 +72,13 @@ class SplitMKLDNNKernel : public framework::OpKernel { bool need_resize = false; if (ctx.HasInput("AxisTensor")) { - auto* axis_tensor = ctx.Input("AxisTensor"); + auto* axis_tensor = ctx.Input("AxisTensor"); axis = GetDataFromTensor(axis_tensor)[0]; need_resize = true; } - auto sections_tensor_list = ctx.MultiInput("SectionsTensorList"); + auto sections_tensor_list = + ctx.MultiInput("SectionsTensorList"); if (sections_tensor_list.size() > 0) { sections = GetDataFromTensorList(sections_tensor_list); need_resize = true; diff --git a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc index 1e546e44fa241..4426f820b64d0 100644 --- a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc @@ -23,7 +23,7 @@ using dnnl::primitive; using dnnl::stream; using framework::DataLayout; using framework::LoDTensor; -using framework::Tensor; + using platform::to_void_cast; template @@ -32,8 +32,8 @@ class StackMKLDNNHandler public: StackMKLDNNHandler(const framework::ExecutionContext& ctx, const dnnl::engine mkldnn_engine, - const std::vector& inputs, - Tensor* output) + const std::vector& inputs, + phi::DenseTensor* output) : platform::MKLDNNHandlerNoCachingT(mkldnn_engine, ctx.GetPlace()) { int stack_axis = ctx.Attr("axis"); @@ -93,7 +93,8 @@ class StackMKLDNNHandler dst_md, stack_axis, srcs_md, this->engine_)); } - std::shared_ptr AcquireSrcMemory(const Tensor& input, int i) { + std::shared_ptr AcquireSrcMemory(const phi::DenseTensor& input, + int i) { const T* input_data = input.data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i), to_void_cast(input_data)); @@ -108,9 +109,9 @@ class StackMKLDNNOpKernel : public paddle::framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto multi_input = ctx.MultiInput("X"); + auto multi_input = ctx.MultiInput("X"); - Tensor* output = ctx.Output("Y"); + phi::DenseTensor* output = ctx.Output("Y"); StackMKLDNNHandler handler(ctx, mkldnn_engine, multi_input, output); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 072016d729cdb..ab415ff47a0ec 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -38,7 +38,7 @@ namespace operators { using paddle::platform::MKLDNNDeviceContext; using phi::CPUContext; using platform::to_void_cast; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using LoDTensor = framework::LoDTensor; @@ -84,7 +84,7 @@ class SumMKLDNNHandler new dnnl::sum::primitive_desc(dst_md, scales, srcs_md, this->engine_)); } - std::shared_ptr AcquireSrcMemory(const framework::Tensor& input, + std::shared_ptr AcquireSrcMemory(const phi::DenseTensor& input, int i) { const T* input_data = input.data(); return 
this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i), diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index a01901950bc41..b7b0f33ade85c 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -21,7 +21,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using framework::DataLayout; template @@ -50,7 +50,7 @@ class TransposeMKLDNNHandler { return std::make_shared(src_md, engine_, ptr); } - std::shared_ptr AcquireDstMemory(framework::Tensor* output, + std::shared_ptr AcquireDstMemory(phi::DenseTensor* output, platform::Place place) { auto dst_md = Axis2MemoryDesc(dims_, axis_); auto dst_data = output->mutable_data(place, dst_md.get_size()); @@ -101,8 +101,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { const auto& mkldnn_engine = dev_ctx.GetEngine(); std::vector axis = ctx.Attr>("axis"); int ndims = axis.size(); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); const T* input_data = input->data(); if (ndims == 1) { @@ -140,9 +140,8 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { true, paddle::platform::errors::PreconditionNotMet( "Operator DNNL TransposeGrad must use CPUPlace")); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); if (!x_grad) return; auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 4cd754775d9c0..a9da6ea2abb56 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -206,7 +206,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const int tensor_dim, cnnlSetTensorDescriptorPosition(raw_tensor_desc, position)); } -MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor, +MLUCnnlTensorDesc::MLUCnnlTensorDesc(const phi::DenseTensor& tensor, const cnnlTensorLayout_t layout, const cnnlDataType_t tensor_dtype) { auto dims = phi::vectorize(tensor.dims()); @@ -227,11 +227,11 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor, } } -MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor) +MLUCnnlTensorDesc::MLUCnnlTensorDesc(const phi::DenseTensor& tensor) : MLUCnnlTensorDesc( tensor, CNNL_LAYOUT_ARRAY, ToCnnlDataType(tensor.dtype())) {} -MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor, +MLUCnnlTensorDesc::MLUCnnlTensorDesc(const phi::DenseTensor& tensor, cnnlTensorLayout_t layout, const cnnlDataType_t tensor_dtype, int position) @@ -240,7 +240,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor, cnnlSetTensorDescriptorPosition(raw_tensor_desc, position)); } -MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor, +MLUCnnlTensorDesc::MLUCnnlTensorDesc(const phi::DenseTensor& tensor, cnnlTensorLayout_t layout, const cnnlDataType_t tensor_dtype, int position, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index e56331b2728c4..f8d5bfd205c7c 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = framework::DataLayout; using ExecutionContext = framework::ExecutionContext; using DeviceContextPool = platform::DeviceContextPool; @@ -86,9 +86,9 @@ inline cnnlInterpBackwardMode_t GetMLUCnnlInterpBackwardMode( "Not support interp mode of MLU Device: %s", interp_mode)); } -inline const void* GetBasePtr(const Tensor* t) { return t->data(); } +inline const void* GetBasePtr(const phi::DenseTensor* t) { return t->data(); } -inline void* GetBasePtr(Tensor* t) { return t->data(); } +inline void* GetBasePtr(phi::DenseTensor* t) { return t->data(); } inline cnnlDataType_t ToCnnlDataType( const paddle::experimental::DataType& dtype) { @@ -256,18 +256,18 @@ class MLUCnnlTensorDesc { const cnnlDataType_t tensor_dtype, int position); - MLUCnnlTensorDesc(const Tensor& tensor, + MLUCnnlTensorDesc(const phi::DenseTensor& tensor, const cnnlTensorLayout_t layout, const cnnlDataType_t tensor_dtype); - explicit MLUCnnlTensorDesc(const Tensor& tensor); + explicit MLUCnnlTensorDesc(const phi::DenseTensor& tensor); - MLUCnnlTensorDesc(const Tensor& tensor, + MLUCnnlTensorDesc(const phi::DenseTensor& tensor, cnnlTensorLayout_t layout, const cnnlDataType_t tensor_dtype, int position); - MLUCnnlTensorDesc(const Tensor& tensor, + MLUCnnlTensorDesc(const phi::DenseTensor& tensor, cnnlTensorLayout_t layout, const cnnlDataType_t tensor_dtype, int position, @@ -2211,8 +2211,8 @@ inline void SetMLUTransposePerm(const framework::DDim& dims, template inline void TransposeFromMLUTensor(const ExecutionContext& ctx, const std::vector perm, - const Tensor* transformed_input, - Tensor* transformed_output, + const phi::DenseTensor* transformed_input, + phi::DenseTensor* transformed_output, bool need_reshape_or_alloc) { const int dim_size = perm.size(); if (need_reshape_or_alloc) { @@ -2241,7 +2241,7 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx, template inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx, T value, - Tensor* out) { + phi::DenseTensor* out) { MLUCnnlTensorDesc out_desc(*out); MLUCnnl::Fill( ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), GetBasePtr(out)); diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu index 67c3a5d90da9a..330f4ca3596bd 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cu +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -23,7 +23,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; struct ModifiedHuberLossBackward { template @@ -45,10 +45,10 @@ template class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("Y"); - auto* in1 = context.Input("IntermediateVal"); - auto* in2 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); + auto* in0 = context.Input("Y"); + auto* in1 = context.Input("IntermediateVal"); + auto* in2 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); if (out0) { auto counts = phi::product(in1->dims()); diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h index cde9c818dd6af..50d5a14548e35 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.h +++ b/paddle/fluid/operators/modified_huber_loss_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template @@ -57,10 +57,10 @@ template class ModifiedHuberLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* out0 = context.Output("IntermediateVal"); - auto* out1 = context.Output("Out"); + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("IntermediateVal"); + auto* out1 = context.Output("Out"); out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); @@ -84,10 +84,10 @@ template class ModifiedHuberLossGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("Y"); - auto* in1 = context.Input("IntermediateVal"); - auto* in2 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); + auto* in0 = context.Input("Y"); + auto* in1 = context.Input("IntermediateVal"); + auto* in2 = context.Input(framework::GradVarName("Out")); + auto* out0 = context.Output(framework::GradVarName("X")); if (out0) { const T* y_ptr = in0->data(); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 2d4ca62955eb1..9a3d540ea2ee2 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -30,7 +30,6 @@ namespace paddle { namespace operators { using framework::OpKernelType; -using framework::Tensor; constexpr int kMULMKLDNNINT8 = 1; constexpr int kMULMKLDNNFP32 = 2; diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 6617cb277a791..ab8334909edcc 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -25,9 +25,9 @@ template class MulNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); int x_num_col_dims = ctx.Attr("x_num_col_dims"); int y_num_col_dims = ctx.Attr("y_num_col_dims"); auto stream = @@ -120,11 +120,11 @@ template class MulGradNPUKernel : public framework::OpKernel { public: 
void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); int x_num_col_dims = ctx.Attr("x_num_col_dims"); int y_num_col_dims = ctx.Attr("y_num_col_dims"); auto stream = diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index eeeda2bcb93ff..b83bc8ea6541b 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: diff --git a/paddle/fluid/operators/multinomial_op_npu.cc b/paddle/fluid/operators/multinomial_op_npu.cc index 305d7cc5cd70c..206c7b041a9b3 100644 --- a/paddle/fluid/operators/multinomial_op_npu.cc +++ b/paddle/fluid/operators/multinomial_op_npu.cc @@ -22,14 +22,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class NPUMultinomialKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); + const auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); const int64_t num_samples = ctx.Attr("num_samples"); const bool replacement = ctx.Attr("replacement"); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 5931e8d301439..749849a333f3d 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class MultiplexOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index 04fb6957580d6..52c3aa57604f8 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -20,7 +20,7 @@ namespace paddle { namespace operators { using framework::LoDTensor; -using framework::Tensor; + using platform::Communicator; template diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index c9c4d1a4c74f3..4020dfb9afc71 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class NCEOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2141ad0f50c76..8e7e02b9667b5 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -31,7 +31,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = phi::SelectedRows; using Sampler = math::Sampler; @@ -46,7 +46,7 @@ template void PrepareSamples(const framework::ExecutionContext &context, Sampler *sampler, Tensor *sample_labels) { - auto label = context.Input("Label"); + auto label = context.Input("Label"); const int64_t *label_data = label->data(); auto label_dims = label->dims(); // for unitest @@ -98,9 +98,10 @@ class NCEKernel : public framework::OpKernel { break; } case 2: { - auto dist_probs = context.Input("CustomDistProbs"); - auto dist_alias = context.Input("CustomDistAlias"); - auto dist_alias_probs = context.Input("CustomDistAliasProbs"); + auto dist_probs = context.Input("CustomDistProbs"); + auto dist_alias = context.Input("CustomDistAlias"); + auto dist_alias_probs = + context.Input("CustomDistAliasProbs"); PADDLE_ENFORCE_EQ( dist_probs->numel(), @@ -153,14 +154,15 @@ class NCEKernel : public framework::OpKernel { } std::vector sample_out_dims; - auto label = context.Input("Label"); + auto label = context.Input("Label"); Tensor *sample_labels; Tensor *sample_out; Tensor sample_labels_tmp, sample_out_tmp; if (is_test) { // set dims of output(SampleOut) int num_true_classes = label->dims().size() == 2 ? label->dims()[1] : 1; - sample_out_dims.push_back((context.Input("Input"))->dims()[0]); + sample_out_dims.push_back( + (context.Input("Input"))->dims()[0]); sample_out_dims.push_back( (num_true_classes == -1) ? -1 : (num_neg_samples + num_true_classes)); @@ -170,8 +172,8 @@ class NCEKernel : public framework::OpKernel { sample_out = &sample_out_tmp; sample_out->Resize(phi::make_ddim(sample_out_dims)); } else { - sample_labels = context.Output("SampleLabels"); - sample_out = context.Output("SampleLogits"); + sample_labels = context.Output("SampleLabels"); + sample_out = context.Output("SampleLogits"); } PrepareSamples(context, sampler, sample_labels); @@ -189,12 +191,12 @@ class NCEKernel : public framework::OpKernel { } T *sample_out_data = sample_out->mutable_data(context.GetPlace()); - auto sample_weight = context.Input("SampleWeight"); + auto sample_weight = context.Input("SampleWeight"); const T *sample_weight_data = nullptr; if (sample_weight != nullptr) { sample_weight_data = sample_weight->data(); } - auto out = context.Output("Cost"); + auto out = context.Output("Cost"); T *out_data = out->mutable_data(context.GetPlace()); int64_t num_true_class = 1; if (label != nullptr) { @@ -203,7 +205,7 @@ class NCEKernel : public framework::OpKernel { int64_t sampled_labels_num = sample_labels->dims()[1]; // T b = 1. 
/ num_total_classes * num_neg_samples; // forward bias - auto bias = context.Input("Bias"); + auto bias = context.Input("Bias"); if (bias != nullptr) { const T *bias_data = bias->data(); for (int64_t i = 0; i < sample_labels->numel(); ++i) { @@ -215,9 +217,11 @@ class NCEKernel : public framework::OpKernel { } } // forward mul - auto input_mat = EigenMatrix::From(*(context.Input("Input"))); + auto input_mat = + EigenMatrix::From(*(context.Input("Input"))); - auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); + auto weight_mat = + EigenMatrix::From(*(context.Input("Weight"))); for (int64_t i = 0; i < sample_labels->numel(); ++i) { Eigen::Tensor result = (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * @@ -247,14 +251,15 @@ template class NCEGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto d_out = context.Input(framework::GradVarName("Cost")); + auto d_out = + context.Input(framework::GradVarName("Cost")); const T *d_out_data = d_out->data(); - auto label = context.Input("Label"); - auto sample_out = context.Input("SampleLogits"); + auto label = context.Input("Label"); + auto sample_out = context.Input("SampleLogits"); const T *sample_out_data = sample_out->data(); - auto sample_labels = context.Input("SampleLabels"); + auto sample_labels = context.Input("SampleLabels"); const int64_t *sample_labels_data = sample_labels->data(); - auto sample_weight = context.Input("SampleWeight"); + auto sample_weight = context.Input("SampleWeight"); const T *sample_weight_data = nullptr; if (sample_weight != nullptr) { sample_weight_data = sample_weight->data(); @@ -279,9 +284,10 @@ class NCEGradKernel : public framework::OpKernel { break; } case 2: { - auto dist_probs = context.Input("CustomDistProbs"); - auto dist_alias = context.Input("CustomDistAlias"); - auto dist_alias_probs = context.Input("CustomDistAliasProbs"); + auto dist_probs = context.Input("CustomDistProbs"); + auto dist_alias = context.Input("CustomDistAlias"); + auto dist_alias_probs = + context.Input("CustomDistAliasProbs"); PADDLE_ENFORCE_EQ( dist_probs->numel(), @@ -351,7 +357,8 @@ class NCEGradKernel : public framework::OpKernel { } // get d_bias - auto d_bias = context.Output(framework::GradVarName("Bias")); + auto d_bias = + context.Output(framework::GradVarName("Bias")); if (d_bias != nullptr) { T *d_bias_data = d_bias->mutable_data(context.GetPlace()); std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); @@ -364,12 +371,14 @@ class NCEGradKernel : public framework::OpKernel { if (!is_sparse) { // get d_w - auto d_w = context.Output(framework::GradVarName("Weight")); + auto d_w = + context.Output(framework::GradVarName("Weight")); if (d_w != nullptr) { auto d_w_data = d_w->mutable_data(context.GetPlace()); std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); auto d_w_matrix = EigenMatrix::From(*d_w); - auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); + auto x_matrix = + EigenMatrix::From(*(context.Input("Input"))); for (int64_t i = 0; i < sample_labels->numel(); ++i) { d_w_matrix.chip(sample_labels_data[i], 0) += x_matrix.chip(static_cast(i / sample_labels->dims()[1]), 0) * @@ -410,7 +419,8 @@ class NCEGradKernel : public framework::OpKernel { std::fill(d_w_data, d_w_data + d_table_value->numel(), 0.0); auto d_w_matrix = EigenMatrix::From(*d_table_value); - auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); + auto x_matrix = + EigenMatrix::From(*(context.Input("Input"))); for (int64_t i = 0; i < 
sample_labels->numel(); ++i) { d_w_matrix.chip(d_w->Index(sample_labels_data[i]), 0) += x_matrix.chip(static_cast(i / sample_labels->dims()[1]), 0) * @@ -419,12 +429,14 @@ class NCEGradKernel : public framework::OpKernel { } // get d_x - auto d_x = context.Output(framework::GradVarName("Input")); + auto d_x = + context.Output(framework::GradVarName("Input")); if (d_x != nullptr) { auto *d_x_data = d_x->mutable_data(context.GetPlace()); std::fill(d_x_data, d_x_data + d_x->numel(), 0.0); auto d_x_matrix = EigenMatrix::From(*d_x); - auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); + auto w_matrix = + EigenMatrix::From(*(context.Input("Weight"))); for (int64_t i = 0; i < sample_labels->numel(); ++i) { d_x_matrix.chip(static_cast(i / sample_labels->dims()[1]), 0) += w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc index e2b6875eeeaef..c5f0749227e23 100644 --- a/paddle/fluid/operators/norm_op_npu.cc +++ b/paddle/fluid/operators/norm_op_npu.cc @@ -16,7 +16,7 @@ namespace paddle { namespace operators { using DDim = framework::DDim; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; void CheckAxis(int axis, int rank) { // check the axis is in [-rank, rank-1] @@ -34,9 +34,9 @@ class NormNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { VLOG(4) << "Launch Norm Op Kernel on NPU." << std::endl; - auto *in_x = ctx.Input("X"); - auto *out_y = ctx.Output("Out"); - auto *out_norm = ctx.Output("Norm"); + auto *in_x = ctx.Input("X"); + auto *out_y = ctx.Output("Out"); + auto *out_norm = ctx.Output("Norm"); out_y->mutable_data(ctx.GetPlace()); out_norm->mutable_data(ctx.GetPlace()); auto xdim = in_x->dims(); @@ -67,10 +67,10 @@ class NormGradNPUKernel : public framework::OpKernel { float epsilon = ctx.Attr("epsilon"); int axis = ctx.Attr("axis"); - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Out"); - auto *dy = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Out"); + auto *dy = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); auto xdim = x->dims(); CheckAxis(axis, xdim.size()); diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 88f9ca02ff100..b331ef2529ac5 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -37,7 +37,7 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = framework::DataLayout; // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index 330163b1f9350..b9d46a8559bc6 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -38,7 +38,7 @@ static inline int GET_BLOCKS(const int N) { } using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template __global__ void initialize_zero_kernel(T* data, const int length) { diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc index 59842249adcdd..8e1a07975e2da 100644 --- a/paddle/fluid/operators/one_hot_op.cc +++ b/paddle/fluid/operators/one_hot_op.cc @@ -65,7 
+65,7 @@ class OneHotOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "depth_tensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 85594ff05742e..c91f2995af042 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -70,9 +70,9 @@ class OneHotCUDAKernel : public framework::OpKernel { int depth = -1; if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); + auto* depth_tensor = context.Input("depth_tensor"); if (platform::is_gpu_place(depth_tensor->place())) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync( *depth_tensor, platform::CPUPlace(), &temp); depth = *temp.data(); diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h index 95d767fed805c..6e139c94880be 100644 --- a/paddle/fluid/operators/one_hot_op.h +++ b/paddle/fluid/operators/one_hot_op.h @@ -77,7 +77,7 @@ struct OneHotOpFunctor { }; using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class OneHotKernel : public framework::OpKernel { public: @@ -87,7 +87,7 @@ class OneHotKernel : public framework::OpKernel { int depth = context.Attr("depth"); bool allow_out_of_range = context.Attr("allow_out_of_range"); if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); + auto* depth_tensor = context.Input("depth_tensor"); auto* depth_data = depth_tensor->data(); depth = depth_data[0]; auto in_dims = in->dims(); diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc index 5d6fe0d50bdd6..2ca74cac0a051 100644 --- a/paddle/fluid/operators/one_hot_op_npu.cc +++ b/paddle/fluid/operators/one_hot_op_npu.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class OneHotNPUKernel : public framework::OpKernel { @@ -30,7 +30,7 @@ class OneHotNPUKernel : public framework::OpKernel { int depth = ctx.Attr("depth"); if (ctx.HasInput("depth_tensor")) { - auto* depth_tensor = ctx.Input("depth_tensor"); + auto* depth_tensor = ctx.Input("depth_tensor"); std::vector depth_data; framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); depth = depth_data[0]; diff --git a/paddle/fluid/operators/one_hot_op_xpu.cc b/paddle/fluid/operators/one_hot_op_xpu.cc index 7c213956bfde4..6812a2415ed53 100644 --- a/paddle/fluid/operators/one_hot_op_xpu.cc +++ b/paddle/fluid/operators/one_hot_op_xpu.cc @@ -23,7 +23,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class OneHotXPUKernel : public framework::OpKernel { @@ -37,7 +37,7 @@ class OneHotXPUKernel : public framework::OpKernel { // get depth from input tensor if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); + auto* depth_tensor = context.Input("depth_tensor"); auto* depth_data = depth_tensor->data(); if (platform::is_xpu_place(depth_tensor->place())) { xpu_memcpy(static_cast(&depth), diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index daf491c64b6d4..55cb5d1a53b2f 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -38,7 +38,7 @@ class OneHotV2Op : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "depth_tensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/one_hot_v2_op_mlu.cc b/paddle/fluid/operators/one_hot_v2_op_mlu.cc index f574cc525f142..a7b1a30afe567 100644 --- a/paddle/fluid/operators/one_hot_v2_op_mlu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_mlu.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -33,7 +33,8 @@ class OneHotV2MLUKernel : public framework::OpKernel { int depth = ctx.Attr("depth"); if (ctx.HasInput("depth_tensor")) { std::vector depth_data; - depth_data = GetDataFromTensor(ctx.Input("depth_tensor")); + depth_data = + GetDataFromTensor(ctx.Input("depth_tensor")); depth = depth_data[0]; auto out_dims = out->dims(); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index 8399d41050399..1ea952cfcb7e6 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -31,7 +31,7 @@ class OneHotV2NPUKernel : public framework::OpKernel { int depth = ctx.Attr("depth"); if (ctx.HasInput("depth_tensor")) { - auto* depth_tensor = ctx.Input("depth_tensor"); + auto* depth_tensor = ctx.Input("depth_tensor"); std::vector depth_data; framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); depth = depth_data[0]; diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index ef37c21496e55..4390da3c4e479 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class AdadeltaOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index ae05070692fb0..e122b4c92822b 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -25,7 +25,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class AdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index cdbd8c4b9dfd2..aa331df4cbd0c 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class AdamOp : public framework::OperatorWithKernel { public: @@ -34,7 +34,7 @@ class AdamOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const { if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || var_name == "SkipUpdate") { diff --git a/paddle/fluid/operators/optimizers/adam_op_functor.h b/paddle/fluid/operators/optimizers/adam_op_functor.h index 15dee861b874e..7be2ab055cd41 100644 --- a/paddle/fluid/operators/optimizers/adam_op_functor.h +++ b/paddle/fluid/operators/optimizers/adam_op_functor.h @@ -23,9 +23,9 @@ namespace operators { namespace scatter = paddle::operators::math::scatter; -static inline float GetAttrFromTensor(const framework::Tensor* tensor) { +static inline float GetAttrFromTensor(const phi::DenseTensor* tensor) { const float* tensor_data = tensor->data(); - framework::Tensor cpu_tensor; + phi::DenseTensor cpu_tensor; if (platform::is_gpu_place(tensor->place())) { paddle::framework::TensorCopySync( *tensor, platform::CPUPlace(), &cpu_tensor); diff --git a/paddle/fluid/operators/optimizers/adam_op_mlu.cc b/paddle/fluid/operators/optimizers/adam_op_mlu.cc index 6ee63354fbff4..af912a5acab6e 100644 --- a/paddle/fluid/operators/optimizers/adam_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_mlu.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -48,8 +48,8 @@ class AdamMLUKernel : public framework::OpKernel { auto* mom2 = ctx.Input("Moment2"); auto* lr = ctx.Input("LearningRate"); - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); auto* param_out = ctx.Output("ParamOut"); auto* mom1_out = ctx.Output("Moment1Out"); @@ -59,7 +59,7 @@ class AdamMLUKernel : public framework::OpKernel { bool skip_update = false; if (ctx.HasInput("SkipUpdate")) { - auto* skip_update_tensor = ctx.Input("SkipUpdate"); + auto* skip_update_tensor = ctx.Input("SkipUpdate"); PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, platform::errors::InvalidArgument( @@ -153,16 +153,16 @@ class AdamMLUKernel : public framework::OpKernel { "value is:%d.", beta2_pow_out->numel())); - const Tensor* beta1_tensor = nullptr; - const Tensor* beta2_tensor = nullptr; - const Tensor* epsilon_tensor = nullptr; + const phi::DenseTensor* beta1_tensor = nullptr; + const phi::DenseTensor* beta2_tensor = nullptr; + const phi::DenseTensor* epsilon_tensor = nullptr; Tensor beta1_tmp(experimental::DataType::FLOAT32); Tensor beta2_tmp(experimental::DataType::FLOAT32); Tensor epsilon_tmp(experimental::DataType::FLOAT32); if (ctx.HasInput("Beta1Tensor")) { - beta1_tensor = ctx.Input("Beta1Tensor"); + beta1_tensor = ctx.Input("Beta1Tensor"); PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, platform::errors::InvalidArgument( @@ -181,7 +181,7 @@ class AdamMLUKernel : public framework::OpKernel { } if (ctx.HasInput("Beta2Tensor")) { - beta2_tensor = ctx.Input("Beta2Tensor"); + beta2_tensor = ctx.Input("Beta2Tensor"); PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, platform::errors::InvalidArgument( @@ -200,7 +200,7 @@ class AdamMLUKernel : public framework::OpKernel { } if (ctx.HasInput("EpsilonTensor")) { - epsilon_tensor = ctx.Input("EpsilonTensor"); + epsilon_tensor = ctx.Input("EpsilonTensor"); PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, platform::errors::InvalidArgument( @@ -278,7 +278,7 @@ class AdamWMLUKernel : public AdamMLUKernel { bool skip_update = false; if (ctx.HasInput("SkipUpdate")) { VLOG(3) << "Has SkipUpdate"; - auto* skip_update_tensor = ctx.Input("SkipUpdate"); + auto* skip_update_tensor = ctx.Input("SkipUpdate"); PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, platform::errors::InvalidArgument( @@ -338,19 +338,19 @@ class MergedAdamMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // Get inputs and outputs - auto params = ctx.MultiInput("Param"); - auto grads = ctx.MultiInput("Grad"); - auto lrs = ctx.MultiInput("LearningRate"); - auto mom1s = ctx.MultiInput("Moment1"); - auto mom2s = ctx.MultiInput("Moment2"); - auto beta1_pows = ctx.MultiInput("Beta1Pow"); - auto beta2_pows = ctx.MultiInput("Beta2Pow"); - auto master_params = ctx.MultiInput("MasterParam"); - auto param_outs = ctx.MultiOutput("ParamOut"); - auto mom1_outs = ctx.MultiOutput("Moment1Out"); - auto mom2_outs = ctx.MultiOutput("Moment2Out"); - auto beta1_pow_outs = ctx.MultiOutput("Beta1PowOut"); - auto beta2_pow_outs = ctx.MultiOutput("Beta2PowOut"); + auto params = ctx.MultiInput("Param"); + auto grads = ctx.MultiInput("Grad"); + auto lrs = ctx.MultiInput("LearningRate"); + auto mom1s = ctx.MultiInput("Moment1"); + auto mom2s = ctx.MultiInput("Moment2"); + auto 
beta1_pows = ctx.MultiInput("Beta1Pow"); + auto beta2_pows = ctx.MultiInput("Beta2Pow"); + auto master_params = ctx.MultiInput("MasterParam"); + auto param_outs = ctx.MultiOutput("ParamOut"); + auto mom1_outs = ctx.MultiOutput("Moment1Out"); + auto mom2_outs = ctx.MultiOutput("Moment2Out"); + auto beta1_pow_outs = ctx.MultiOutput("Beta1PowOut"); + auto beta2_pow_outs = ctx.MultiOutput("Beta2PowOut"); // Check validation of inputs and outputs size_t param_num = params.size(); @@ -365,7 +365,7 @@ class MergedAdamMLUKernel : public framework::OpKernel { bool skip_update = false; if (ctx.HasInput("SkipUpdate")) { - auto* skip_update_tensor = ctx.Input("SkipUpdate"); + auto* skip_update_tensor = ctx.Input("SkipUpdate"); PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, platform::errors::InvalidArgument( @@ -416,9 +416,9 @@ class MergedAdamMLUKernel : public framework::OpKernel { VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; // Get beta1, beta2 and epsilon from attribute. - const Tensor* beta1_tensor = nullptr; - const Tensor* beta2_tensor = nullptr; - const Tensor* epsilon_tensor = nullptr; + const phi::DenseTensor* beta1_tensor = nullptr; + const phi::DenseTensor* beta2_tensor = nullptr; + const phi::DenseTensor* epsilon_tensor = nullptr; Tensor beta1_tmp(experimental::DataType::FLOAT32); Tensor beta2_tmp(experimental::DataType::FLOAT32); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 3642c45ba6aab..d7850cae972d1 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -51,8 +51,8 @@ class AdamNPUKernel : public framework::OpKernel { auto* mom2 = ctx.Input("Moment2"); auto* lr = ctx.Input("LearningRate"); - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); auto* param_out = ctx.Output("ParamOut"); auto* mom1_out = ctx.Output("Moment1Out"); @@ -62,7 +62,7 @@ class AdamNPUKernel : public framework::OpKernel { bool skip_update = false; if (ctx.HasInput("SkipUpdate")) { - auto* skip_update_tensor = ctx.Input("SkipUpdate"); + auto* skip_update_tensor = ctx.Input("SkipUpdate"); PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, platform::errors::InvalidArgument( @@ -129,16 +129,16 @@ class AdamNPUKernel : public framework::OpKernel { beta2_pow = &beta2_pow_tmp; } - const Tensor* beta1_tensor = nullptr; - const Tensor* beta2_tensor = nullptr; - const Tensor* epsilon_tensor = nullptr; + const phi::DenseTensor* beta1_tensor = nullptr; + const phi::DenseTensor* beta2_tensor = nullptr; + const phi::DenseTensor* epsilon_tensor = nullptr; Tensor beta1_tmp(experimental::DataType::FLOAT32); Tensor beta2_tmp(experimental::DataType::FLOAT32); Tensor epsilon_tmp(experimental::DataType::FLOAT32); if (ctx.HasInput("Beta1Tensor")) { - beta1_tensor = ctx.Input("Beta1Tensor"); + beta1_tensor = ctx.Input("Beta1Tensor"); PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, platform::errors::InvalidArgument( @@ -152,7 +152,7 @@ class AdamNPUKernel : public framework::OpKernel { } if (ctx.HasInput("Beta2Tensor")) { - beta2_tensor = ctx.Input("Beta2Tensor"); + beta2_tensor = ctx.Input("Beta2Tensor"); PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, 
platform::errors::InvalidArgument( @@ -166,7 +166,7 @@ class AdamNPUKernel : public framework::OpKernel { } if (ctx.HasInput("EpsilonTensor")) { - epsilon_tensor = ctx.Input("EpsilonTensor"); + epsilon_tensor = ctx.Input("EpsilonTensor"); PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, platform::errors::InvalidArgument( @@ -264,7 +264,7 @@ class AdamWNPUKernel : public AdamNPUKernel { bool skip_update = false; if (ctx.HasInput("SkipUpdate")) { VLOG(3) << "Has SkipUpdate"; - auto* skip_update_tensor = ctx.Input("SkipUpdate"); + auto* skip_update_tensor = ctx.Input("SkipUpdate"); PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index 75f9e25796ea0..5298030f17a04 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class AdamaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 90ce98c4dc316..94a52d9765bfa 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class DecayedAdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h index 741a12ded2e0a..98d807b9e9977 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h @@ -40,8 +40,8 @@ class DecayedAdagradOpKernel : public framework::OpKernel { ctx.InputNames("Grad").front(), framework::ToTypeName(grad_var->Type()))); - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment_out_tensor = ctx.Output("MomentOut"); + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); param_out_tensor->mutable_data(ctx.GetPlace()); moment_out_tensor->mutable_data(ctx.GetPlace()); @@ -50,13 +50,13 @@ class DecayedAdagradOpKernel : public framework::OpKernel { float epsilon = ctx.Attr("epsilon"); auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); + *ctx.Input("Param")); auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); + *ctx.Input("Grad")); auto moment = framework::EigenVector::Flatten( - *ctx.Input("Moment")); + *ctx.Input("Moment")); auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); + *ctx.Input("LearningRate")); auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc index 09847ff216f5a..2b4b1c1a109bd 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc @@ -37,7 +37,7 @@ class DGCMomentumOp : public MomentumOp { framework::OpKernelType 
GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "current_step" || var_name == "nranks") { VLOG(10) << "var_name:" << var_name << " need not to transform"; diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.h b/paddle/fluid/operators/optimizers/dgc_momentum_op.h index 5ea3a4cc808d9..86e069fe45e63 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.h +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.h @@ -34,11 +34,11 @@ class DGCMomentumKernel : public framework::OpKernel { return; } - auto current_step_tensor = context.Input("current_step"); + auto current_step_tensor = context.Input("current_step"); auto* current_step = current_step_tensor->data(); // nranks - auto nranks_tensor = context.Input("nranks"); + auto nranks_tensor = context.Input("nranks"); const int nranks = static_cast(*nranks_tensor->data()); PADDLE_ENFORCE_GT( nranks, @@ -47,8 +47,8 @@ class DGCMomentumKernel : public framework::OpKernel { "DGC is not useful when num_trainers <= 1, but now nranks=%d", nranks)); - const framework::Tensor* g = context.Input("Grad"); - framework::Tensor* g_out = context.Output("Grad_out"); + const phi::DenseTensor* g = context.Input("Grad"); + phi::DenseTensor* g_out = context.Output("Grad_out"); auto g_e = framework::EigenVector::Flatten(*g); auto g_out_e = framework::EigenVector::Flatten(*g_out); @@ -64,16 +64,16 @@ class DGCMomentumKernel : public framework::OpKernel { const auto* grad_var = context.InputVar("Grad"); if (static_cast(*current_step) < static_cast(rampup_begin_step)) { VLOG(10) << " so use momentum optimizer"; - auto* learning_rate = context.Input("LearningRate"); + auto* learning_rate = context.Input("LearningRate"); bool multi_precision = context.Attr("multi_precision"); - auto* param = context.Input("Param"); - auto* velocity = context.Input("Velocity"); - auto* param_out = context.Output("ParamOut"); - auto* velocity_out = context.Output("VelocityOut"); + auto* param = context.Input("Param"); + auto* velocity = context.Input("Velocity"); + auto* param_out = context.Output("ParamOut"); + auto* velocity_out = context.Output("VelocityOut"); auto* master_param_out = - context.Output("MasterParamOut"); - paddle::optional master_param_opt(paddle::none); + context.Output("MasterParamOut"); + paddle::optional master_param_opt(paddle::none); float mu = context.Attr("mu"); bool use_nesterov = context.Attr("use_nesterov"); std::string regularization_method = @@ -81,9 +81,9 @@ class DGCMomentumKernel : public framework::OpKernel { float regularization_coeff = context.Attr("regularization_coeff"); float rescale_grad = context.Attr("rescale_grad"); - if (grad_var->IsType()) { + if (grad_var->IsType()) { // sgd_dense - auto* grad = context.Input("Grad"); + auto* grad = context.Input("Grad"); phi::MomentumDenseKernel( static_cast::TYPE&>(dev_ctx), @@ -130,22 +130,22 @@ class DGCMomentumKernel : public framework::OpKernel { const auto* param_var = context.InputVar("Param"); - auto* learning_rate = context.Input("LearningRate"); + auto* learning_rate = context.Input("LearningRate"); bool multi_precision = context.Attr("multi_precision"); if (param_var->IsType()) { - auto* param = context.Input("Param"); - auto* param_out = context.Output("ParamOut"); + auto* param = context.Input("Param"); + auto* param_out = context.Output("ParamOut"); auto* master_param_out = - context.Output("MasterParamOut"); - 
paddle::optional master_param_opt(paddle::none); + context.Output("MasterParamOut"); + paddle::optional master_param_opt(paddle::none); if (multi_precision) { - auto* master_param = context.Input("MasterParam"); + auto* master_param = context.Input("MasterParam"); master_param_opt = *master_param; } - if (grad_var->IsType()) { + if (grad_var->IsType()) { // sgd_dense - auto* grad = context.Input("Grad"); + auto* grad = context.Input("Grad"); phi::SGDDenseKernel( static_cast::TYPE&>(dev_ctx), diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index d922b2a30cf90..2ad50781ad985 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -28,8 +28,8 @@ using phi::funcs::FlattenToString; using phi::funcs::ToVector; struct ParamGradInfo { - framework::Tensor *param_t{nullptr}; - framework::Tensor *grad_t{nullptr}; + phi::DenseTensor *param_t{nullptr}; + phi::DenseTensor *grad_t{nullptr}; size_t idx{0}; size_t numel{0}; size_t numel_with_padding{0}; @@ -182,7 +182,7 @@ static size_t FillAlignmentPaddingInfo(std::vector *infos, template static T *TensorFillConstant(const phi::GPUContext &dev_ctx, - framework::Tensor *tensor, + phi::DenseTensor *tensor, const framework::DDim &dims, T value) { tensor->Resize(dims); @@ -192,10 +192,10 @@ static T *TensorFillConstant(const phi::GPUContext &dev_ctx, return ptr; } -static framework::Tensor CastDataForInitedTensor(const phi::GPUContext &dev_ctx, - framework::Tensor *origin, - framework::Tensor *fused_out, - size_t numel_offset) { +static phi::DenseTensor CastDataForInitedTensor(const phi::GPUContext &dev_ctx, + phi::DenseTensor *origin, + phi::DenseTensor *fused_out, + size_t numel_offset) { PADDLE_ENFORCE_EQ(origin->IsInitialized(), true, platform::errors::InvalidArgument( @@ -224,9 +224,9 @@ static framework::Tensor CastDataForInitedTensor(const phi::GPUContext &dev_ctx, return sliced_tensor; } -static framework::Tensor CopyAndShareBufferForInitedTensor( - framework::Tensor *origin, - framework::Tensor *fused_out, +static phi::DenseTensor CopyAndShareBufferForInitedTensor( + phi::DenseTensor *origin, + phi::DenseTensor *fused_out, size_t numel_offset, gpuStream_t stream) { PADDLE_ENFORCE_EQ( @@ -271,8 +271,8 @@ static framework::Tensor CopyAndShareBufferForInitedTensor( return sliced_tensor; } -static void ShareBufferForNonInitedTensor(framework::Tensor *origin, - framework::Tensor *fused_out, +static void ShareBufferForNonInitedTensor(phi::DenseTensor *origin, + phi::DenseTensor *fused_out, size_t numel_offset, const framework::DDim &dims) { PADDLE_ENFORCE_EQ( @@ -295,7 +295,7 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin, template static void CopyVectorToCPUTensor(const std::vector &src, - framework::Tensor *dst) { + phi::DenseTensor *dst) { dst->Resize({static_cast(src.size())}); T *dst_ptr = dst->mutable_data(platform::CPUPlace()); const T *src_ptr = src.data(); @@ -351,9 +351,9 @@ class DistributedFusedLambInitOpKernel // Step 1: Check Input(Param) and Output(ParamOut), Input(Grad) and // Output(GradOut) - auto params = ctx.MultiInput("Param"); - auto grads = ctx.MultiInput("Grad"); - auto master_params = ctx.MultiOutput("MasterParamOut"); + auto params = ctx.MultiInput("Param"); + auto grads = ctx.MultiInput("Grad"); + auto master_params = ctx.MultiOutput("MasterParamOut"); std::vector fp32_infos, fp16_infos; { 
PADDLE_ENFORCE_EQ(params.size(), @@ -362,8 +362,8 @@ class DistributedFusedLambInitOpKernel "The parameter number and parameter gradient " "number should be the same.")); - auto params_out = ctx.MultiOutput("ParamOut"); - auto grads_out = ctx.MultiOutput("GradOut"); + auto params_out = ctx.MultiOutput("ParamOut"); + auto grads_out = ctx.MultiOutput("GradOut"); PADDLE_ENFORCE_EQ( params.size(), params_out.size(), @@ -469,7 +469,7 @@ class DistributedFusedLambInitOpKernel size_t fp16_wd_end_idx = ReorderParamGradInfoList(apply_weight_decay, &fp16_infos); - auto *param_order_t = ctx.Output("ParamOrder"); + auto *param_order_t = ctx.Output("ParamOrder"); auto param_num = fp32_infos.size() + fp16_infos.size(); param_order_t->Resize({static_cast(param_num)}); auto *param_order = param_order_t->mutable_data(platform::CPUPlace()); @@ -535,30 +535,30 @@ class DistributedFusedLambInitOpKernel // Step 3: allocate output tensor and do initialization float *fused_fp32_param = nullptr, *fused_fp32_grad = nullptr; platform::float16 *fused_fp16_param = nullptr, *fused_fp16_grad = nullptr; - framework::Tensor *fp32_p_t = nullptr, *fp16_p_t = nullptr, - *fp32_g_t = nullptr, *fp16_g_t = nullptr; - std::vector fp16_master_params; + phi::DenseTensor *fp32_p_t = nullptr, *fp16_p_t = nullptr, + *fp32_g_t = nullptr, *fp16_g_t = nullptr; + std::vector fp16_master_params; if (total_numel > 0) { - fp32_p_t = ctx.Output("FP32FusedParam"); + fp32_p_t = ctx.Output("FP32FusedParam"); fused_fp32_param = TensorFillConstant( dev_ctx, fp32_p_t, {static_cast(total_numel)}, 0.0f); } if (fp32_numel > 0) { - fp32_g_t = ctx.Output("FP32FusedGrad"); + fp32_g_t = ctx.Output("FP32FusedGrad"); fused_fp32_grad = TensorFillConstant( dev_ctx, fp32_g_t, {static_cast(fp32_numel)}, 0.0f); } if (fp16_numel > 0) { - fp16_p_t = ctx.Output("FP16FusedParam"); + fp16_p_t = ctx.Output("FP16FusedParam"); fused_fp16_param = TensorFillConstant( dev_ctx, fp16_p_t, {static_cast(fp16_numel)}, static_cast(0)); - fp16_g_t = ctx.Output("FP16FusedGrad"); + fp16_g_t = ctx.Output("FP16FusedGrad"); fused_fp16_grad = TensorFillConstant( dev_ctx, fp16_g_t, @@ -622,19 +622,19 @@ class DistributedFusedLambInitOpKernel // Step 4: For Moment1, Moment2, Beta1Pow, Beta2Pow, just fill constant TensorFillConstant(dev_ctx, - ctx.Output("Moment1"), + ctx.Output("Moment1"), {static_cast(numel_each_device)}, 0.0f); TensorFillConstant(dev_ctx, - ctx.Output("Moment2"), + ctx.Output("Moment2"), {static_cast(numel_each_device)}, 0.0f); TensorFillConstant(dev_ctx, - ctx.Output("Beta1Pow"), + ctx.Output("Beta1Pow"), {1}, ctx.Attr("beta1")); TensorFillConstant(dev_ctx, - ctx.Output("Beta2Pow"), + ctx.Output("Beta2Pow"), {1}, ctx.Attr("beta2")); VLOG(10) << "Init Moment and BetaPow ends"; @@ -665,7 +665,7 @@ class DistributedFusedLambInitOpKernel size_t total_local_param_num = fp32_local_param_num + fp16_local_param_num; VLOG(10) << "Found the sharding arguments"; - auto *param_info_t = ctx.Output("ParamInfo"); + auto *param_info_t = ctx.Output("ParamInfo"); param_info_t->Resize({8}); auto *param_info = param_info_t->mutable_data(platform::CPUPlace()); param_info[0] = static_cast(fp32_start_idx); @@ -760,22 +760,22 @@ class DistributedFusedLambInitOpKernel } CopyVectorToCPUTensor(numel_offsets, - ctx.Output("FusedParamOffsets")); + ctx.Output("FusedParamOffsets")); CopyVectorToCPUTensor( fp32_partial_numel_offsets, - ctx.Output("FP32ShardFusedParamOffsets")); + ctx.Output("FP32ShardFusedParamOffsets")); CopyVectorToCPUTensor( fp16_partial_numel_offsets, - 
ctx.Output("FP16ShardFusedParamOffsets")); + ctx.Output("FP16ShardFusedParamOffsets")); - auto *global_scale = ctx.Output("GlobalScale"); + auto *global_scale = ctx.Output("GlobalScale"); if (!global_scale->IsInitialized()) { TensorFillConstant(dev_ctx, global_scale, {1}, 1.0f); } VLOG(10) << "Init global scale ends"; TensorFillConstant(dev_ctx, - ctx.Output("Step"), + ctx.Output("Step"), {1}, static_cast(0)); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 9f286fef47773..d810f8df7370a 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -32,7 +32,7 @@ class DistributedFusedLambOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { return expected_kernel_type; } diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 8a799f2bdc83c..908be3cd41d21 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -225,10 +225,10 @@ static void LogParamAndTrustRatioDivSquareNorm( const float *trust_ratio_div_square_norm) { if (!VLOG_IS_ON(LogLevel)) return; - auto tensors = ctx.MultiInput("Param"); + auto tensors = ctx.MultiInput("Param"); if (tensors.empty()) return; - const auto *order = ctx.Input("ParamOrder")->data(); + const auto *order = ctx.Input("ParamOrder")->data(); size_t n = tensors.size(); auto place = tensors[0]->place(); @@ -264,7 +264,7 @@ template static const T *GetInputTensorPtr(const framework::ExecutionContext &ctx, const char *in_name, int64_t *numel = nullptr) { - const auto *in_tensor = ctx.Input(in_name); + const auto *in_tensor = ctx.Input(in_name); PADDLE_ENFORCE_NOT_NULL( in_tensor, platform::errors::InvalidArgument("Input(%s) cannot be NULL.", in_name)); @@ -283,7 +283,7 @@ static T *GetSameInOutTensorPtr(const framework::ExecutionContext &ctx, const char *in_name, const char *out_name, int64_t *numel = nullptr) { - const auto *in_tensor = ctx.Input(in_name); + const auto *in_tensor = ctx.Input(in_name); if (in_tensor == nullptr || !in_tensor->IsInitialized()) { PADDLE_ENFORCE_EQ(AllowNotExist, true, @@ -293,7 +293,7 @@ static T *GetSameInOutTensorPtr(const framework::ExecutionContext &ctx, return nullptr; } - auto *out_tensor = ctx.Output(out_name); + auto *out_tensor = ctx.Output(out_name); PADDLE_ENFORCE_NOT_NULL( in_tensor, platform::errors::InvalidArgument("Input(%s) cannot be NULL.", in_name)); @@ -1145,8 +1145,7 @@ static std::string GetMinMaxStr(const T *x, } struct VisitDTypeFunctor { - VisitDTypeFunctor(const framework::Tensor *x, std::string *s) - : x_(x), s_(s) {} + VisitDTypeFunctor(const phi::DenseTensor *x, std::string *s) : x_(x), s_(s) {} template void apply() const { @@ -1154,11 +1153,11 @@ struct VisitDTypeFunctor { } private: - const framework::Tensor *x_; + const phi::DenseTensor *x_; std::string *s_; }; -static std::string GetMinMaxStr(const framework::Tensor *x) { +static std::string GetMinMaxStr(const phi::DenseTensor *x) { if (x == nullptr) return "null"; if (!x->IsInitialized()) return "not_inited"; if (!platform::is_gpu_place(x->place())) return "CPUTensor"; @@ -1173,7 +1172,7 @@ 
static void PrintAllMinMaxRange(const framework::ExecutionContext &ctx, if (!VLOG_IS_ON(1)) return; for (const auto &pair : ctx.GetOp().Inputs()) { const auto &key = pair.first; - const auto tensors = ctx.MultiInput(key); + const auto tensors = ctx.MultiInput(key); size_t n = tensors.size(); for (size_t i = 0; i < n; ++i) { VLOG(1) << "Input(" << key + ")[" << i << "] = " << pair.second[i] @@ -1184,7 +1183,7 @@ static void PrintAllMinMaxRange(const framework::ExecutionContext &ctx, if (only_inputs) return; for (const auto &pair : ctx.GetOp().Outputs()) { const auto &key = pair.first; - const auto tensors = ctx.MultiOutput(key); + const auto tensors = ctx.MultiOutput(key); size_t n = tensors.size(); for (size_t i = 0; i < n; ++i) { VLOG(1) << "Output(" << key + ")[" << i << "] = " << pair.second[i] @@ -1340,7 +1339,7 @@ class DistributedFusedLambOpKernel auto stream = dev_ctx.stream(); auto place = dev_ctx.GetPlace(); - auto *found_inf_t = ctx.Output("FoundInf"); + auto *found_inf_t = ctx.Output("FoundInf"); found_inf_t->Resize({1}); // Step 1: Get fp16 param and grad tensors @@ -1397,7 +1396,7 @@ class DistributedFusedLambOpKernel platform::errors::InvalidArgument( "The gradient accumulation steps should be not less than 1.")); if (acc_steps > 1) { - auto *step_t = ctx.Output("AccStep"); + auto *step_t = ctx.Output("AccStep"); PADDLE_ENFORCE_NOT_NULL( step_t, platform::errors::InvalidArgument( @@ -1417,7 +1416,7 @@ class DistributedFusedLambOpKernel float *fp32_acc_grad = nullptr; if (has_fp32_param) { auto *fp32_acc_grad_t = - ctx.Output("FP32AccFusedGrad"); + ctx.Output("FP32AccFusedGrad"); PADDLE_ENFORCE_NOT_NULL( fp32_acc_grad_t, platform::errors::InvalidArgument( @@ -1437,7 +1436,7 @@ class DistributedFusedLambOpKernel if (has_fp16_param) { use_master_acc_grad = ctx.Attr("use_master_acc_grad"); auto *fp16_acc_grad_t = - ctx.Output("FP16AccFusedGrad"); + ctx.Output("FP16AccFusedGrad"); PADDLE_ENFORCE_NOT_NULL( fp16_acc_grad_t, platform::errors::InvalidArgument( @@ -1527,7 +1526,7 @@ class DistributedFusedLambOpKernel } } - auto *stop_update_t = ctx.Output("StopUpdate"); + auto *stop_update_t = ctx.Output("StopUpdate"); stop_update_t->Resize({1}); auto *stop_update = stop_update_t->mutable_data(platform::CPUPlace()); @@ -2061,18 +2060,18 @@ class DistributedFusedLambOpKernel VLOG(10) << "ReduceScatter done"; // Step 7: update the moment1, moment2. Calcuate the trust_ratio_div - auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); + auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); auto *fused_offsets = fused_offsets_t->data(); auto *fp32_partial_fused_offsets_t = - ctx.Input("FP32ShardFusedParamOffsets"); + ctx.Input("FP32ShardFusedParamOffsets"); const auto *fp32_partial_fused_offsets = fp32_partial_fused_offsets_t->data(); auto *fp16_partial_fused_offsets_t = - ctx.Input("FP16ShardFusedParamOffsets"); + ctx.Input("FP16ShardFusedParamOffsets"); const auto *fp16_partial_fused_offsets = fp16_partial_fused_offsets_t->data(); - auto *step = ctx.Output("Step")->data(); + auto *step = ctx.Output("Step")->data(); VLOG(1) << "FusedParamOffsets: " << FlattenToString(fused_offsets, diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index d058b890cbd9d..9d522031acf6c 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
 */
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;
 class DpsgdOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h
index 7d6a99ad2c55c..abc5a619ca830 100644
--- a/paddle/fluid/operators/optimizers/dpsgd_op.h
+++ b/paddle/fluid/operators/optimizers/dpsgd_op.h
@@ -46,12 +46,12 @@ class DpsgdOpKernel : public framework::OpKernel<T> {
           ctx.InputNames("Grad").front(),
           framework::ToTypeName(grad_var->Type())));
 
-    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+    const auto *learning_rate = ctx.Input<phi::DenseTensor>("LearningRate");
 
-    const auto *param = ctx.Input<framework::Tensor>("Param");
-    const auto *grad = ctx.Input<framework::Tensor>("Grad");
+    const auto *param = ctx.Input<phi::DenseTensor>("Param");
+    const auto *grad = ctx.Input<phi::DenseTensor>("Grad");
 
-    auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto *param_out = ctx.Output<phi::DenseTensor>("ParamOut");
 
     auto sz = param_out->numel();
     PADDLE_ENFORCE_EQ(param->numel(),
diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc
index 50060b1636943..b81a6c5ab6bb7 100644
--- a/paddle/fluid/operators/optimizers/ftrl_op.cc
+++ b/paddle/fluid/operators/optimizers/ftrl_op.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;
 class FTRLOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h
index e15233c718a9a..abd0e15e471b1 100644
--- a/paddle/fluid/operators/optimizers/ftrl_op.h
+++ b/paddle/fluid/operators/optimizers/ftrl_op.h
@@ -21,7 +21,7 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template @@ -120,15 +120,15 @@ class FTRLOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { const auto* grad_var = ctx.InputVar("Grad"); - auto* lr_in = ctx.Input("LearningRate"); + auto* lr_in = ctx.Input("LearningRate"); - auto* param_in = ctx.Input("Param"); - auto* sq_accum_in = ctx.Input("SquaredAccumulator"); - auto* lin_accum_in = ctx.Input("LinearAccumulator"); + auto* param_in = ctx.Input("Param"); + auto* sq_accum_in = ctx.Input("SquaredAccumulator"); + auto* lin_accum_in = ctx.Input("LinearAccumulator"); - auto* param_out = ctx.Output("ParamOut"); - auto* sq_accum_out = ctx.Output("SquaredAccumOut"); - auto* lin_accum_out = ctx.Output("LinearAccumOut"); + auto* param_out = ctx.Output("ParamOut"); + auto* sq_accum_out = ctx.Output("SquaredAccumOut"); + auto* lin_accum_out = ctx.Output("LinearAccumOut"); param_out->mutable_data(ctx.GetPlace()); sq_accum_out->mutable_data(ctx.GetPlace()); @@ -139,7 +139,7 @@ class FTRLOpKernel : public framework::OpKernel { auto lr_power = static_cast(ctx.Attr("lr_power")); if (grad_var->IsType()) { - auto grad = ctx.Input("Grad"); + auto grad = ctx.Input("Grad"); auto g = EigenVector::Flatten(*grad); auto p = EigenVector::Flatten(*param_in); diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc index e9d6ab77f4357..3e2ee495b0586 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cc +++ b/paddle/fluid/operators/optimizers/lamb_op.cc @@ -37,7 +37,7 @@ class LambOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const { if (var_name == "Beta1Pow" || var_name == "Beta2Pow") { return expected_kernel_type; diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 5337e56b28d5b..066bf66c4549a 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -484,7 +484,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { bool multi_precision = ctx.Attr("multi_precision"); auto& cuda_ctx = ctx.template device_context(); int sm_num = cuda_ctx.GetSMCount(); - framework::Tensor tmp_buffer_t = ctx.AllocateTmpTensor( + phi::DenseTensor tmp_buffer_t = ctx.AllocateTmpTensor( {LARS_BLOCK_SIZE << 1}, cuda_ctx); auto* p_buffer = tmp_buffer_t.mutable_data(ctx.GetPlace()); auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h index 459900b14f61d..4aaf37af73faf 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.h +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h @@ -49,7 +49,7 @@ class LarsMomentumOpKernel : public framework::OpKernel { auto g = framework::EigenVector::Flatten(*(grad[i])); auto rescale_g = rescale_grad * g; - framework::Tensor p_norm_t, g_norm_t; + phi::DenseTensor p_norm_t, g_norm_t; p_norm_t.Resize({1}); g_norm_t.Resize({1}); p_norm_t.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cc b/paddle/fluid/operators/optimizers/merged_adam_op.cc index f49fc72d01030..8e4ff40372a12 100644 --- 
a/paddle/fluid/operators/optimizers/merged_adam_op.cc +++ b/paddle/fluid/operators/optimizers/merged_adam_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class MergedAdamOp : public framework::OperatorWithKernel { public: @@ -34,7 +34,7 @@ class MergedAdamOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || var_name == "SkipUpdate") { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc index 90faf8f389a89..c390a12863bc4 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -28,8 +28,8 @@ template class MLUMergedMomentumOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto params = ctx.MultiInput("Param"); - auto params_out = ctx.MultiOutput("ParamOut"); + auto params = ctx.MultiInput("Param"); + auto params_out = ctx.MultiOutput("ParamOut"); size_t n = params.size(); PADDLE_ENFORCE_EQ(n, params_out.size(), @@ -47,7 +47,7 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { "must be the same Tensors.")); } - auto grads = ctx.MultiInput("Grad"); + auto grads = ctx.MultiInput("Grad"); PADDLE_ENFORCE_EQ( n, grads.size(), @@ -57,7 +57,7 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { grads.size(), n)); - auto velocitys = ctx.MultiInput("Velocity"); + auto velocitys = ctx.MultiInput("Velocity"); PADDLE_ENFORCE_EQ(n, velocitys.size(), platform::errors::InvalidArgument( @@ -67,7 +67,7 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { velocitys.size(), n)); - auto velocitys_out = ctx.MultiOutput("VelocityOut"); + auto velocitys_out = ctx.MultiOutput("VelocityOut"); PADDLE_ENFORCE_EQ( n, velocitys_out.size(), @@ -86,7 +86,7 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { } auto mu = static_cast(ctx.Attr("mu")); - auto lrs = ctx.MultiInput("LearningRate"); + auto lrs = ctx.MultiInput("LearningRate"); if (lrs.size() != 1) { PADDLE_ENFORCE_EQ( n, diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc index 38479d6dba22e..6dd1cdbc03e9a 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc @@ -28,8 +28,8 @@ template class NPUMergedMomentumOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto params = ctx.MultiInput("Param"); - auto params_out = ctx.MultiOutput("ParamOut"); + auto params = ctx.MultiInput("Param"); + auto params_out = ctx.MultiOutput("ParamOut"); size_t n = params.size(); PADDLE_ENFORCE_EQ(n, params_out.size(), @@ -47,7 +47,7 @@ class NPUMergedMomentumOpKernel : public framework::OpKernel { "must be the same Tensors.")); } - auto grads = ctx.MultiInput("Grad"); + auto grads = ctx.MultiInput("Grad"); PADDLE_ENFORCE_EQ( n, grads.size(), @@ -57,7 +57,7 @@ class NPUMergedMomentumOpKernel : public framework::OpKernel { grads.size(), n)); - auto velocitys = ctx.MultiInput("Velocity"); + 
auto velocitys = ctx.MultiInput("Velocity"); PADDLE_ENFORCE_EQ(n, velocitys.size(), platform::errors::InvalidArgument( @@ -67,7 +67,7 @@ class NPUMergedMomentumOpKernel : public framework::OpKernel { velocitys.size(), n)); - auto velocitys_out = ctx.MultiOutput("VelocityOut"); + auto velocitys_out = ctx.MultiOutput("VelocityOut"); PADDLE_ENFORCE_EQ( n, velocitys_out.size(), @@ -86,7 +86,7 @@ class NPUMergedMomentumOpKernel : public framework::OpKernel { } T mu = static_cast(ctx.Attr("mu")); - auto lrs = ctx.MultiInput("LearningRate"); + auto lrs = ctx.MultiInput("LearningRate"); if (lrs.size() != 1) { PADDLE_ENFORCE_EQ( n, diff --git a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc index e332972f7576a..ea5f3f9a2e806 100644 --- a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc +++ b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc @@ -28,10 +28,10 @@ class SGDOneDNNKernel : public SGDOpKernel { void dense_param_and_grad_kernel( const framework::ExecutionContext &ctx) const override { VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *learning_rate = ctx.Input("LearningRate"); + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad = ctx.Input("Grad"); auto *out_data = param_out->mutable_data(ctx.GetPlace()); const T *param_data = param->data(); @@ -46,8 +46,8 @@ class SGDOneDNNKernel : public SGDOpKernel { void dense_param_sparse_grad_kernel( const framework::ExecutionContext &ctx) const override { VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - auto *param_out = ctx.Output("ParamOut"); + const auto *learning_rate = ctx.Input("LearningRate"); + auto *param_out = ctx.Output("ParamOut"); const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc index 7a738a8994768..4171f0c11955a 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cc +++ b/paddle/fluid/operators/optimizers/momentum_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class MomentumOpInferVarType : public framework::VarTypeInference { public: diff --git a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc index eeeddfc793f31..9aa16c7fe642b 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc @@ -37,19 +37,19 @@ class MLUMomentumOpKernel : public framework::OpKernel { T mu = static_cast(ctx.Attr("mu")); bool use_nesterov = ctx.Attr("use_nesterov"); - auto learning_rate = ctx.Input("LearningRate"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); + auto learning_rate = ctx.Input("LearningRate"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); param_out->mutable_data(ctx.GetPlace()); velocity_out->mutable_data(ctx.GetPlace()); auto* grad_var = ctx.InputVar("Grad"); if (grad_var->IsType()) { - auto grad = ctx.Input("Grad"); + auto grad = ctx.Input("Grad"); Tensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc index 234f86fe38bca..40136919fe17a 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_npu.cc @@ -37,19 +37,19 @@ class NPUMomentumOpKernel : public framework::OpKernel { T mu = static_cast(ctx.Attr("mu")); bool use_nesterov = ctx.Attr("use_nesterov"); - auto learning_rate = ctx.Input("LearningRate"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); + auto learning_rate = ctx.Input("LearningRate"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); param_out->mutable_data(ctx.GetPlace()); velocity_out->mutable_data(ctx.GetPlace()); auto* grad_var = ctx.InputVar("Grad"); if (grad_var->IsType()) { - auto grad = ctx.Input("Grad"); + auto grad = ctx.Input("Grad"); Tensor mu_tensor; mu_tensor.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); FillNpuTensorWithConstant(&mu_tensor, mu); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h index d3d2e48fdcd6c..8f3be79cd4c8d 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h @@ -71,10 +71,10 @@ template class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const { - const auto *lr = ctx.Input("LearningRate"); - const auto *step = ctx.Input("Step"); - auto *lr_out = ctx.Output("LearningRateOut"); - auto *step_out = ctx.Output("StepOut"); + const auto *lr = ctx.Input("LearningRate"); + const auto *step = ctx.Input("Step"); + auto *lr_out = ctx.Output("LearningRateOut"); + auto *step_out = ctx.Output("StepOut"); PADDLE_ENFORCE_EQ( lr, lr_out, diff --git 
a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op_xpu.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op_xpu.cc index 4a13e226df8ce..543a4634c6d71 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op_xpu.cc @@ -27,10 +27,10 @@ template class Pow2DecayWithLinearWarmupXPUOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const { - const auto *lr = ctx.Input("LearningRate"); - const auto *step = ctx.Input("Step"); - auto *lr_out = ctx.Output("LearningRateOut"); - auto *step_out = ctx.Output("StepOut"); + const auto *lr = ctx.Input("LearningRate"); + const auto *step = ctx.Input("Step"); + auto *lr_out = ctx.Output("LearningRateOut"); + auto *step_out = ctx.Output("StepOut"); PADDLE_ENFORCE_EQ( lr, lr_out, diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index de280a6788779..2da5bed7642c1 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class ProximalAdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h index 3faf8ea765944..136e416307ab0 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h @@ -19,14 +19,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ProximalAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* param_out = ctx.Output("ParamOut"); - auto* moment_out = ctx.Output("MomentOut"); + auto* param_out = ctx.Output("ParamOut"); + auto* moment_out = ctx.Output("MomentOut"); param_out->mutable_data(ctx.GetPlace()); moment_out->mutable_data(ctx.GetPlace()); @@ -34,12 +34,14 @@ class ProximalAdagradOpKernel : public framework::OpKernel { auto l1 = static_cast(ctx.Attr("l1")); auto l2 = static_cast(ctx.Attr("l2")); - auto grad = ctx.Input("Grad"); - auto p = framework::EigenVector::Flatten(*ctx.Input("Param")); - auto m = framework::EigenVector::Flatten(*ctx.Input("Moment")); + auto grad = ctx.Input("Grad"); + auto p = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto m = framework::EigenVector::Flatten( + *ctx.Input("Moment")); auto g = framework::EigenVector::Flatten(*grad); - auto lr = - framework::EigenVector::Flatten(*ctx.Input("LearningRate")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); auto p_out = framework::EigenVector::Flatten(*param_out); auto m_out = framework::EigenVector::Flatten(*moment_out); diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index 2460b30fa26b0..061e495c4bacd 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class ProximalGDOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.h b/paddle/fluid/operators/optimizers/proximal_gd_op.h index 7caa8421f041c..024062045ae43 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.h +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.h @@ -19,25 +19,26 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ProximalGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* param_out = ctx.Output("ParamOut"); + auto* param_out = ctx.Output("ParamOut"); param_out->mutable_data(ctx.GetPlace()); - auto grad = ctx.Input("Grad"); + auto grad = ctx.Input("Grad"); auto l1 = static_cast(ctx.Attr("l1")); auto l2 = static_cast(ctx.Attr("l2")); - auto p = framework::EigenVector::Flatten(*ctx.Input("Param")); + auto p = framework::EigenVector::Flatten( + *ctx.Input("Param")); auto g = framework::EigenVector::Flatten(*grad); - auto lr = - framework::EigenVector::Flatten(*ctx.Input("LearningRate")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); auto p_out = framework::EigenVector::Flatten(*param_out); auto& place = *ctx.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc index 8f6a35a8b6747..a70c129bad038 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -62,8 +62,8 @@ class RMSPROPNPUKernel : public framework::OpKernel { epsilon_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&epsilon_tmp, epsilon); epsilon_tensor = &epsilon_tmp; - auto *mg_tensor = ctx.Input("MeanGrad"); - auto *mean_grad_out = ctx.Output("MeanGradOut"); + auto *mg_tensor = ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); mean_grad_out->mutable_data(ctx.GetPlace()); const auto &runner_applycenterrmsprop = NpuOpRunner( std::string("ApplyCenteredRMSPropD"), diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 803bc9f980a51..0bc3cb11f4d06 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -60,7 +60,7 @@ class SGDOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const { if (var_name == "LearningRate") { return framework::OpKernelType( diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 28ca7c6d8d3b7..686529758260a 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -76,18 +76,18 @@ class SGDOpKernel : public framework::OpKernel { "but the received is %s", ctx.InputNames("Param").front(), paddle::framework::ToTypeName(param_var->Type()))); - using paddle::framework::Tensor; + using MPDType = typename details::MPTypeTrait::Type; - auto* param = ctx.Input("Param"); - auto* param_out = ctx.Output("ParamOut"); - auto* learning_rate = ctx.Input("LearningRate"); + auto* param = ctx.Input("Param"); + auto* param_out = ctx.Output("ParamOut"); + auto* learning_rate = ctx.Input("LearningRate"); auto* grad_var = ctx.InputVar("Grad"); const bool multi_precision = ctx.Attr("multi_precision"); - const Tensor* master_param = nullptr; - Tensor* master_param_out = nullptr; + const phi::DenseTensor* master_param = nullptr; + phi::DenseTensor* master_param_out = nullptr; if (multi_precision) { bool has_master = ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); @@ -97,8 +97,8 @@ class SGDOpKernel : public framework::OpKernel { "The Input(MasterParam) and Output(MasterParamOut) " "should not be null when " "the attr `multi_precision` is true")); - master_param = ctx.Input("MasterParam"); - master_param_out = ctx.Output("MasterParamOut"); + master_param = ctx.Input("MasterParam"); + master_param_out = ctx.Output("MasterParamOut"); } const MPDType* master_in_data = multi_precision ? master_param->data() : nullptr; @@ -109,7 +109,7 @@ class SGDOpKernel : public framework::OpKernel { // Actually, all tensors are LoDTensor except SelectedRows. 
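Every hunk in these optimizer kernels is the same mechanical edit: the explicit framework::Tensor spelling is replaced by phi::DenseTensor in the Input/Output template arguments and in local pointer declarations, with no intended change to allocation or data access (consistent with the alias redefinitions to `using Tensor = phi::DenseTensor;` elsewhere in the patch). A minimal before/after sketch of the pattern, assuming the usual ExecutionContext API; the snippet is illustrative and not compilable on its own:

    // Before: the fluid alias was named explicitly.
    //   const auto* param   = ctx.Input<framework::Tensor>("Param");
    //   auto* param_out     = ctx.Output<framework::Tensor>("ParamOut");
    //   const framework::Tensor* master_param = nullptr;

    // After: the underlying phi type is named directly; behavior is identical.
    const auto* param   = ctx.Input<phi::DenseTensor>("Param");
    auto* param_out     = ctx.Output<phi::DenseTensor>("ParamOut");
    const phi::DenseTensor* master_param = nullptr;
    if (multi_precision) {  // master weights only exist in multi-precision mode
      master_param = ctx.Input<phi::DenseTensor>("MasterParam");
    }
    param_out->mutable_data<T>(ctx.GetPlace());  // allocation API unchanged

The GetKernelTypeForVar overrides touched in this patch change in the same way: only the tensor parameter type is respelled from framework::Tensor to phi::DenseTensor; the override's logic is untouched.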
if (grad_var->IsType()) { - auto* grad = ctx.Input("Grad"); + auto* grad = ctx.Input("Grad"); int block = 512; int grid = (param->numel() + block - 1) / block; diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 02d8bcbd279dc..16f3e76662dae 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -38,10 +38,10 @@ struct sgd_dense_param_kernel< framework::VarTypeTrait::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *learning_rate = ctx.Input("LearningRate"); + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad = ctx.Input("Grad"); const auto sz = param_out->numel(); jit::sgd_attr_t attr(1, sz, 1, sz, 1); @@ -64,9 +64,9 @@ struct sgd_dense_param_kernel::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); + const auto *learning_rate = ctx.Input("LearningRate"); + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); @@ -98,10 +98,10 @@ struct sgd_dense_param_kernel< framework::VarTypeTrait::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); + const auto *learning_rate = ctx.Input("LearningRate"); + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); + const auto *grad = ctx.Input("Grad"); param_out->mutable_data(ctx.GetPlace()); auto p = framework::EigenVector::Flatten(*param); @@ -119,8 +119,8 @@ struct sgd_dense_param_kernel::kId> { void operator()(const framework::ExecutionContext &ctx) const { VLOG(4) << "[CPU]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - auto *param_out = ctx.Output("ParamOut"); + const auto *learning_rate = ctx.Input("LearningRate"); + auto *param_out = ctx.Output("ParamOut"); const auto *grad = ctx.Input("Grad"); const auto &grad_value = grad->value(); @@ -181,12 +181,12 @@ class SGDOpKernel : public framework::OpKernel { protected: void invoke_dense_param_kernel(const framework::ExecutionContext &ctx) const { - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); + const auto *param = ctx.Input("Param"); + auto *param_out = ctx.Output("ParamOut"); const auto *grad_var = ctx.InputVar("Grad"); if (grad_var->IsType()) { - const auto *grad = ctx.Input("Grad"); + const auto *grad = ctx.Input("Grad"); const auto sz = param_out->numel(); PADDLE_ENFORCE_EQ(param->numel(), sz, @@ -269,7 +269,7 @@ class SGDOpKernel : public framework::OpKernel { void sparse_param_and_grad_kernel( const framework::ExecutionContext &ctx) const { - const auto *learning_rate = ctx.Input("LearningRate"); + const auto *learning_rate = ctx.Input("LearningRate"); const auto *param_var = ctx.InputVar("Param"); const auto *grad_var = 
ctx.InputVar("Grad"); diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc index a92bbbc838a8a..3e072a5e17a64 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class SparseMomentumOpInferVarType : public framework::VarTypeInference { public: diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index e3d59a3744c0b..9eea5c11cb074 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -36,8 +36,6 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using framework::Tensor; - template using MultiPrecisionType = typename details::MPTypeTrait::Type; @@ -305,7 +303,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { const bool multi_precision = ctx.Attr("multi_precision"); bool use_nesterov = ctx.Attr("use_nesterov"); - auto index = ctx.Input("Index"); + auto index = ctx.Input("Index"); const auto& index_type = framework::TransToProtoVarType(index->dtype()); if (multi_precision) { if (use_nesterov) { @@ -371,8 +369,8 @@ class SparseMomentumOpKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); // get axis from tensor if (ctx.HasInput("Axis")) { - Tensor cpu_axis; - const Tensor* axis_tensor = ctx.Input("Axis"); + phi::DenseTensor cpu_axis; + const phi::DenseTensor* axis_tensor = ctx.Input("Axis"); framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); const auto& axis_type = framework::TransToProtoVarType(axis_tensor->dtype()); @@ -388,12 +386,12 @@ class SparseMomentumOpKernel : public framework::OpKernel { platform::errors::InvalidArgument("The axis of sparse_momentum_op only " "support axis=0 or axis=1 now.")); - auto learning_rate = ctx.Input("LearningRate"); - auto param = ctx.Input("Param"); - auto param_out = ctx.Output("ParamOut"); - auto velocity = ctx.Input("Velocity"); - auto velocity_out = ctx.Output("VelocityOut"); - auto index = ctx.Input("Index"); + auto learning_rate = ctx.Input("LearningRate"); + auto param = ctx.Input("Param"); + auto param_out = ctx.Output("ParamOut"); + auto velocity = ctx.Input("Velocity"); + auto velocity_out = ctx.Output("VelocityOut"); + auto index = ctx.Input("Index"); int64_t num_index = index->numel(); // check index of shape 1-D @@ -412,8 +410,8 @@ class SparseMomentumOpKernel : public framework::OpKernel { " the second dimension should be 1.")); } - const framework::Tensor* master_param = nullptr; - framework::Tensor* master_param_out = nullptr; + const phi::DenseTensor* master_param = nullptr; + phi::DenseTensor* master_param_out = nullptr; if (multi_precision) { bool has_master = ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); @@ -423,8 +421,8 @@ class SparseMomentumOpKernel : public framework::OpKernel { "The Input(MasterParam) and Output(MasterParamOut) " "should not be null when " "the attr `multi_precision` is true")); - master_param = ctx.Input("MasterParam"); - master_param_out = ctx.Output("MasterParamOut"); + master_param = ctx.Input("MasterParam"); + master_param_out = ctx.Output("MasterParamOut"); } param_out->mutable_data(ctx.GetPlace()); @@ -435,7 +433,7 @@ class 
SparseMomentumOpKernel : public framework::OpKernel { multi_precision ? master_param_out->mutable_data(ctx.GetPlace()) : nullptr; - auto grad = ctx.Input("Grad"); + auto grad = ctx.Input("Grad"); platform::ForRange for_range( static_cast(ctx.device_context()), @@ -455,7 +453,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { "The Grad's rank of sparse_momentum_op" " must be 2 now.")); - Tensor sorted_index, grad_index, sort_value; + phi::DenseTensor sorted_index, grad_index, sort_value; auto sorted_index_ptr = sorted_index.mutable_data({num_index}, ctx.GetPlace()); auto grad_index_ptr = diff --git a/paddle/fluid/operators/p_norm_op_npu.cc b/paddle/fluid/operators/p_norm_op_npu.cc index fb7ae8756d446..9d312dd572a45 100644 --- a/paddle/fluid/operators/p_norm_op_npu.cc +++ b/paddle/fluid/operators/p_norm_op_npu.cc @@ -22,8 +22,8 @@ template class PnormNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_x = ctx.Input("X"); - auto* out_norm = ctx.Output("Out"); + auto* in_x = ctx.Input("X"); + auto* out_norm = ctx.Output("Out"); out_norm->mutable_data(ctx.GetPlace()); float porder = ctx.Attr("porder"); @@ -93,11 +93,11 @@ template class PnormGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Out"); - auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + using Tensor = phi::DenseTensor; + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Out"); + auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); dx->mutable_data(place); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 66aef5fe4eaa2..b812338e2cb79 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -23,8 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; - template void Pad2DConstNCHW(const T* in_data, const int num, @@ -391,7 +389,7 @@ void Pad2DGradEdgeNHWC(T* d_in_data, static inline void GetPaddings(int* paddings, const framework::ExecutionContext& context) { - auto* paddings_t = context.Input("Paddings"); + auto* paddings_t = context.Input("Paddings"); if (paddings_t) { auto paddings_data = paddings_t->data(); paddings[0] = paddings_data[0]; @@ -414,11 +412,11 @@ class Pad2dCPUKernel : public framework::OpKernel { auto data_format = context.Attr("data_format"); T value = static_cast(context.Attr("pad_value")); - auto* x = context.Input("X"); + auto* x = context.Input("X"); auto in_dims = x->dims(); const T* in_data = x->data(); - auto* out = context.Output("Out"); + auto* out = context.Output("Out"); if (data_format == "NCHW") { out->Resize({in_dims[0], in_dims[1], @@ -530,8 +528,9 @@ class Pad2dGradCPUKernel : public framework::OpKernel { GetPaddings(pads, context); auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_in = context.Output(framework::GradVarName("X")); auto d_in_dims = d_in->dims(); auto d_out_dims = d_out->dims(); const T* d_out_data = d_out->data(); @@ -704,7 +703,7 @@ class Pad2dOp : public framework::OperatorWithKernel { // only constant mode and non-blocked layouts are supported for oneDNN if (this->CanMKLDNNBeUsed(ctx, input_data_type) && ctx.Attr("mode") == "constant" && - ctx.Input("X") + ctx.Input("X") ->mem_desc() .data.format_desc.blocking.inner_nblks == 0) { return framework::OpKernelType(input_data_type, @@ -718,7 +717,7 @@ class Pad2dOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index 5ed217b2e60ef..c76a6b61e780e 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -24,8 +24,6 @@ namespace operators { using platform::PADDLE_CUDA_NUM_THREADS; -using framework::Tensor; - template __global__ void Pad2DConstNCHW(const int nthreads, const T* in_data, @@ -350,9 +348,9 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, static inline void GetPaddings(int* paddings, const framework::ExecutionContext& context) { - auto* paddings_t = context.Input("Paddings"); + auto* paddings_t = context.Input("Paddings"); if (paddings_t) { - Tensor pads; + phi::DenseTensor pads; framework::TensorCopySync(*paddings_t, platform::CPUPlace(), &pads); auto pads_data = pads.data(); paddings[0] = pads_data[0]; @@ -375,10 +373,10 @@ class Pad2dCUDAKernel : public framework::OpKernel { auto data_format = context.Attr("data_format"); T value = static_cast(context.Attr("pad_value")); - auto* x = context.Input("X"); + auto* x = context.Input("X"); auto in_dims = x->dims(); const T* in_data = x->data(); - auto* out = context.Output("Out"); + auto* out = context.Output("Out"); auto out_dims = out->dims(); if (data_format == "NCHW") { out_dims[0] = in_dims[0]; @@ -501,8 +499,9 @@ class Pad2dGradCUDAKernel : public framework::OpKernel { 
GetPaddings(pads, context); auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_in = context.Output(framework::GradVarName("X")); auto d_in_dims = d_in->dims(); auto d_out_dims = d_out->dims(); const T* d_out_data = d_out->data(); diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index 65475f63ec0f9..8fb86ac37aa22 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -25,8 +25,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class Pad3dOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -39,7 +37,7 @@ class Pad3dOp : public framework::OperatorWithKernel { // only constant mode and non-blocked layouts are supported for oneDNN if (this->CanMKLDNNBeUsed(ctx, input_data_type) && ctx.Attr("mode") == "constant" && - ctx.Input("X") + ctx.Input("X") ->mem_desc() .data.format_desc.blocking.inner_nblks == 0) { return framework::OpKernelType(input_data_type, @@ -53,7 +51,7 @@ class Pad3dOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc index 5f1ec06018277..7694e0edbf9f9 100644 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ b/paddle/fluid/operators/pad3d_op_npu.cc @@ -19,12 +19,12 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; static inline std::vector GetPaddings( const framework::ExecutionContext& context) { std::vector paddings(6); - auto* paddings_t = context.Input("Paddings"); + auto* paddings_t = context.Input("Paddings"); if (paddings_t) { paddle::framework::TensorToVector( *paddings_t, context.device_context(), &paddings); @@ -39,7 +39,7 @@ template class Pad3dNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); + auto* x = context.Input("X"); auto in_dims = x->dims(); std::vector pads = GetPaddings(context); @@ -47,7 +47,7 @@ class Pad3dNPUKernel : public framework::OpKernel { float value = context.Attr("value"); auto data_format = context.Attr("data_format"); - auto* out = context.Output("Out"); + auto* out = context.Output("Out"); PADDLE_ENFORCE_LT(abs(value), 1e-5, @@ -106,8 +106,9 @@ class Pad3dGradNPUKernel : public framework::OpKernel { auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_in = context.Output(framework::GradVarName("X")); auto d_in_dims = d_in->dims(); d_in->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 254e8ebe5c570..28d264ba8e41f 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class PadConstantLikeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h index 1207eb1d5cde2..ba87bd3ef1818 100644 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -30,9 +30,9 @@ template class PadConstantLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto in_x = context.Input("X"); - auto in_y = context.Input("Y"); - auto* out = context.Output("Out"); + auto in_x = context.Input("X"); + auto in_y = context.Input("Y"); + auto* out = context.Output("Out"); if (in_x->dims() == in_y->dims()) { framework::TensorCopy(*in_y, context.GetPlace(), out); @@ -42,7 +42,7 @@ class PadConstantLikeKernel : public framework::OpKernel { T pad_value = static_cast(context.Attr("pad_value")); out->mutable_data(context.GetPlace()); - int rank = context.Input("X")->dims().size(); + int rank = context.Input("X")->dims().size(); std::vector pads(rank * 2, 0); @@ -65,10 +65,10 @@ template class PadConstantLikeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto in_y = context.Input("Y"); + auto in_y = context.Input("Y"); auto in_dout = - context.Input(framework::GradVarName("Out")); - auto* d_y = context.Output(framework::GradVarName("Y")); + context.Input(framework::GradVarName("Out")); + auto* d_y = context.Output(framework::GradVarName("Y")); if (d_y == nullptr) { return; diff --git a/paddle/fluid/operators/pad_op.cc 
b/paddle/fluid/operators/pad_op.cc index fb4a90ebd8ca9..4e6a10a912a88 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class PadOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc index 061da7d76e5df..425defc9792c7 100644 --- a/paddle/fluid/operators/pad_op_npu.cc +++ b/paddle/fluid/operators/pad_op_npu.cc @@ -19,14 +19,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class PadNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); auto paddings = context.Attr>("paddings"); float pad_value = context.Attr("pad_value"); @@ -56,8 +56,9 @@ template class PadGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); auto paddings = context.Attr>("paddings"); d_x->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index e9b54632ddc01..396c4f2d038b7 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class PartialConcatOp : public framework::OperatorWithKernel { public: @@ -92,7 +92,7 @@ class PartialConcatOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto inputs = ctx.MultiInput("X"); + auto inputs = ctx.MultiInput("X"); auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto *input : inputs) { diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index a6b2700a1a4da..ae36b85d8520f 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -24,7 +24,7 @@ namespace operators { #define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template __global__ void ConcatPartialCUDAKernel(T **in, @@ -72,8 +72,8 @@ template class PartialConcatOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto in_vars = ctx.MultiInput("X"); - Tensor *out = ctx.Output("Out"); + auto in_vars = ctx.MultiInput("X"); + Tensor *out = ctx.Output("Out"); PADDLE_ENFORCE_EQ(in_vars[0] != nullptr, true, platform::errors::InvalidArgument( @@ -153,7 +153,7 @@ template class PartialConcatGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *out_grad = ctx.Input(framework::GradVarName("Out")); + auto *out_grad = ctx.Input(framework::GradVarName("Out")); auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index 927ffbede6e6c..d81924298588b 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; static inline int64_t ComputeStartIndex(int64_t start_index, int64_t size) { PADDLE_ENFORCE_EQ( @@ -44,8 +44,8 @@ template class PartialConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - framework::Tensor* out = ctx.Output("Out"); + auto ins = ctx.MultiInput("X"); + phi::DenseTensor* out = ctx.Output("Out"); PADDLE_ENFORCE_EQ(ins[0] != nullptr, true, platform::errors::InvalidArgument( @@ -89,7 +89,7 @@ template class PartialConcatGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index eb8271edccf95..aa2f30aaafc2c 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class PartialSumOp : public framework::OperatorWithKernel { public: @@ -94,7 +94,7 @@ class PartialSumOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto inputs = ctx.MultiInput("X"); + auto inputs = ctx.MultiInput("X"); auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto *input : inputs) { diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index e0703532c1268..be1a34651e2d9 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -24,7 +24,7 @@ namespace operators { #define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template __global__ void SumArrayPartialCUDAKernel(T **in, @@ -77,8 +77,8 @@ template class PartialSumOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto in_vars = ctx.MultiInput("X"); - Tensor *out = ctx.Output("Out"); + auto in_vars = ctx.MultiInput("X"); + Tensor *out = ctx.Output("Out"); PADDLE_ENFORCE_EQ( in_vars[0] != nullptr, @@ -151,7 +151,8 @@ template class PartialSumGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const Tensor *out_grad = ctx.Input(framework::GradVarName("Out")); + const Tensor *out_grad = + ctx.Input(framework::GradVarName("Out")); auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index a595630319220..35f104ef55a0a 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -21,14 +21,14 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class PartialSumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - Tensor* out = ctx.Output("Out"); + auto ins = ctx.MultiInput("X"); + phi::DenseTensor* out = ctx.Output("Out"); PADDLE_ENFORCE_EQ( ins[0] != nullptr, true, @@ -63,7 +63,7 @@ template class PartialSumGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index e8b35b89157a3..dce9b1360a015 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -32,7 +32,7 @@ namespace operators { bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // (jczaja): oneDNN is supporting only unchangable in size pool window - auto src_tz = phi::vectorize(ctx.Input("X")->dims()); + auto src_tz = phi::vectorize(ctx.Input("X")->dims()); std::vector ksize = ctx.Attr>("ksize"); // Fast but not exhustive check return ((src_tz[src_tz.size() - 1] % ksize[1] == 0) && @@ -64,7 +64,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( framework::OpKernelType PoolOp::GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && @@ -112,7 +112,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( framework::OpKernelType PoolOpGrad::GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 06b42e504f099..c08b589cbe12e 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class PoolOp : public framework::OperatorWithKernel { public: @@ -34,7 +34,7 @@ class PoolOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; @@ -48,7 +48,7 @@ class PoolOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override; }; diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index 988eb182a16f0..e2af30faf36f4 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -46,8 +46,8 @@ class MLUPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - Tensor *out = ctx.Output("Out"); + const Tensor *in_x = ctx.Input("X"); + Tensor *out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); std::string pooling_type = ctx.Attr("pooling_type"); @@ -102,8 +102,8 @@ class MLUPoolOpKernel : public framework::OpKernel { // transpose NCHW to NHWC since cnnl pool2d has worse performance in that // layout. - framework::Tensor trans_in_x; - framework::Tensor trans_out; + phi::DenseTensor trans_in_x; + phi::DenseTensor trans_out; if (channel_last) { trans_in_x = *in_x; trans_out = *out; @@ -141,7 +141,7 @@ class MLUPoolOpKernel : public framework::OpKernel { handle, pool_mode, out_w, out_h, &extra_input_size); if (extra_input_size > 0) { - framework::Tensor extra_host_tensor; + phi::DenseTensor extra_host_tensor; extra_host_tensor.mutable_data( {static_cast(extra_input_size)}, platform::CPUPlace()); cnnlInitPoolingExtraInput(handle, @@ -149,7 +149,7 @@ class MLUPoolOpKernel : public framework::OpKernel { trans_in_x_desc.get(), trans_out_desc.get(), GetBasePtr(&extra_host_tensor)); - framework::Tensor extra_device_tensor = + phi::DenseTensor extra_device_tensor = ctx.AllocateTmpTensor( {static_cast(extra_input_size)}, dev_ctx); framework::TensorCopy( @@ -212,10 +212,12 @@ class MLUPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - const Tensor *out = ctx.Input("Out"); - const Tensor *out_grad = ctx.Input(framework::GradVarName("Out")); - Tensor *in_x_grad = ctx.Output(framework::GradVarName("X")); + const Tensor *in_x = ctx.Input("X"); + const Tensor *out = ctx.Input("Out"); + const Tensor *out_grad = + ctx.Input(framework::GradVarName("Out")); + Tensor *in_x_grad = + ctx.Output(framework::GradVarName("X")); in_x_grad->mutable_data(ctx.GetPlace()); std::string pooling_type = ctx.Attr("pooling_type"); @@ -249,10 +251,10 @@ class MLUPoolGradOpKernel : public framework::OpKernel { } // inputs need with NHWC layout - framework::Tensor trans_in_x; - framework::Tensor trans_out; - framework::Tensor trans_out_grad; - framework::Tensor trans_in_x_grad; + phi::DenseTensor trans_in_x; + phi::DenseTensor trans_out; + phi::DenseTensor trans_out_grad; + phi::DenseTensor trans_in_x_grad; if (channel_last) { trans_in_x = *in_x; trans_out 
= *out; @@ -300,7 +302,7 @@ class MLUPoolGradOpKernel : public framework::OpKernel { ceil_mode); if (pooling_type == "max") { - framework::Tensor index_tensor = + phi::DenseTensor index_tensor = ctx.AllocateTmpTensor(trans_out_grad.dims(), dev_ctx); MLUCnnlTensorDesc index_tensor_desc( diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc index 7e9b0b65113cc..3fc83a8343c9d 100644 --- a/paddle/fluid/operators/pool_op_npu.cc +++ b/paddle/fluid/operators/pool_op_npu.cc @@ -24,8 +24,8 @@ class NPUPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - Tensor *out = ctx.Output("Out"); + const Tensor *in_x = ctx.Input("X"); + Tensor *out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); std::string pooling_type = ctx.Attr("pooling_type"); @@ -171,10 +171,12 @@ class NPUPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - const Tensor *out = ctx.Input("Out"); - const Tensor *out_grad = ctx.Input(framework::GradVarName("Out")); - Tensor *in_x_grad = ctx.Output(framework::GradVarName("X")); + const Tensor *in_x = ctx.Input("X"); + const Tensor *out = ctx.Input("Out"); + const Tensor *out_grad = + ctx.Input(framework::GradVarName("Out")); + Tensor *in_x_grad = + ctx.Output(framework::GradVarName("X")); in_x_grad->mutable_data(ctx.GetPlace()); std::string pooling_type = ctx.Attr("pooling_type"); diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h index d9e55cac59fd8..e3cbeea2c6f15 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.h +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -34,16 +34,19 @@ class PositiveNegativePairKernel : public framework::OpKernel { }; void Compute(const framework::ExecutionContext& context) const override { - auto score_t = context.Input("Score"); - auto label_t = context.Input("Label"); - auto query_t = context.Input("QueryID"); - auto acc_positive_t = context.Input("AccumulatePositivePair"); - auto acc_negative_t = context.Input("AccumulateNegativePair"); - auto acc_neutral_t = context.Input("AccumulateNeutralPair"); - auto positive_t = context.Output("PositivePair"); - auto negative_t = context.Output("NegativePair"); - auto neutral_t = context.Output("NeutralPair"); - auto weight_t = context.Input("Weight"); + auto score_t = context.Input("Score"); + auto label_t = context.Input("Label"); + auto query_t = context.Input("QueryID"); + auto acc_positive_t = + context.Input("AccumulatePositivePair"); + auto acc_negative_t = + context.Input("AccumulateNegativePair"); + auto acc_neutral_t = + context.Input("AccumulateNeutralPair"); + auto positive_t = context.Output("PositivePair"); + auto negative_t = context.Output("NegativePair"); + auto neutral_t = context.Output("NeutralPair"); + auto weight_t = context.Input("Weight"); auto score = score_t->data(); auto label = label_t->data(); diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index f7abaf648ebcf..af61cc3c3f399 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; framework::OpKernelType innerGetKernelTypeForVar( const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) { diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc index cf8f17d5f747c..c4ebcde91b661 100644 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ b/paddle/fluid/operators/prroi_pool_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class PRROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index e95201c472af8..5d5d32ddbb0e8 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; static constexpr int kNumCUDAThreads = 512; @@ -218,9 +218,9 @@ template class GPUPRROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); + auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto pooled_height = ctx.Attr("pooled_height"); auto pooled_width = ctx.Attr("pooled_width"); @@ -237,14 +237,14 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel { if (rois_num == 0) return; // set rois batch id - framework::Tensor rois_batch_id_list; + phi::DenseTensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(platform::CPUPlace()); if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - framework::Tensor batch_index_cpu; + auto* batchroinum = ctx.Input("BatchRoINums"); + phi::DenseTensor batch_index_cpu; framework::TensorCopySync( *batchroinum, platform::CPUPlace(), &batch_index_cpu); @@ -321,12 +321,14 @@ template class GPUPRROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); + auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Input("Out"); + auto* out = ctx.Input("Out"); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); auto* input_roi_grad = ctx.Output(framework::GradVarName("ROIs")); @@ -342,14 +344,14 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { if (input_grad || input_roi_grad) { // set roi batch id - framework::Tensor rois_batch_id_list; + phi::DenseTensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(platform::CPUPlace()); if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - framework::Tensor batch_index_cpu; + auto* batchroinum = ctx.Input("BatchRoINums"); + phi::DenseTensor batch_index_cpu; framework::TensorCopySync( *batchroinum, platform::CPUPlace(), &batch_index_cpu); diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index c071ce370e747..89782d500afcc 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -331,9 +331,9 @@ template class CPUPRROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); + auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto pooled_height = ctx.Attr("pooled_height"); auto pooled_width = ctx.Attr("pooled_width"); @@ -352,12 +352,12 @@ class CPUPRROIPoolOpKernel : public framework::OpKernel { const T* input_data = in->data(); - framework::Tensor rois_batch_id_list; + phi::DenseTensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(ctx.GetPlace()); if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) 
{ - auto* batchroinum = ctx.Input("BatchRoINums"); + auto* batchroinum = ctx.Input("BatchRoINums"); auto* batch_index = batchroinum->data(); int rois_batch_size = batchroinum->dims()[0]; size_t c = 0; @@ -485,15 +485,15 @@ template class CPUPRROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* out = ctx.Input("Out"); + auto* in = ctx.Input("X"); + auto* out = ctx.Input("Out"); auto* rois = ctx.Input("ROIs"); auto* output_grad = - ctx.Input(framework::GradVarName("Out")); + ctx.Input(framework::GradVarName("Out")); auto* input_grad = - ctx.Output(framework::GradVarName("X")); + ctx.Output(framework::GradVarName("X")); auto* input_roi_grad = - ctx.Output(framework::GradVarName("ROIs")); + ctx.Output(framework::GradVarName("ROIs")); auto pooled_height = ctx.Attr("pooled_height"); auto pooled_width = ctx.Attr("pooled_width"); @@ -511,12 +511,12 @@ class CPUPRROIPoolGradOpKernel : public framework::OpKernel { int rois_num = rois->dims()[0]; // set roi batch id - framework::Tensor rois_batch_id_list; + phi::DenseTensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(ctx.GetPlace()); if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); + auto* batchroinum = ctx.Input("BatchRoINums"); auto* batch_index = batchroinum->data(); int rois_batch_size = batchroinum->dims()[0]; size_t c = 0; diff --git a/paddle/fluid/operators/pscore/fake_init_op.cc b/paddle/fluid/operators/pscore/fake_init_op.cc index 967714bf446bd..f48166be3129f 100644 --- a/paddle/fluid/operators/pscore/fake_init_op.cc +++ b/paddle/fluid/operators/pscore/fake_init_op.cc @@ -32,7 +32,7 @@ class FakeInitOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { - framework::Tensor *tensor = nullptr; + phi::DenseTensor *tensor = nullptr; auto &out_var = *scope.FindVar(Output("Out")); diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h index 2e71f0b23a2c9..8191f1dc882ae 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.h +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h @@ -26,9 +26,9 @@ namespace operators { template static void PullBoxExtendedSparseFunctor( const framework::ExecutionContext &ctx) { - auto inputs = ctx.MultiInput("Ids"); - auto outputs = ctx.MultiOutput("Out"); - auto outputs_extend = ctx.MultiOutput("OutExtend"); + auto inputs = ctx.MultiInput("Ids"); + auto outputs = ctx.MultiOutput("Out"); + auto outputs_extend = ctx.MultiOutput("OutExtend"); const auto slot_size = inputs.size(); std::vector all_keys(slot_size); // BoxPS only supports float now @@ -63,9 +63,9 @@ static void PushBoxExtendedSparseFunctor( const framework::ExecutionContext &ctx) { auto inputs = ctx.MultiInput("Ids"); auto d_output = - ctx.MultiInput(framework::GradVarName("Out")); + ctx.MultiInput(framework::GradVarName("Out")); auto d_output_extend = - ctx.MultiInput(framework::GradVarName("OutExtend")); + ctx.MultiInput(framework::GradVarName("OutExtend")); const auto slot_size = inputs.size(); std::vector all_keys(slot_size); std::vector all_grad_values(slot_size * 2); diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h index 25d8580f38fd8..44c41dd7aa7e6 100644 --- 
a/paddle/fluid/operators/pull_box_sparse_op.h +++ b/paddle/fluid/operators/pull_box_sparse_op.h @@ -29,8 +29,8 @@ namespace operators { template static void PullBoxSparseFunctor(const framework::ExecutionContext &ctx) { - auto inputs = ctx.MultiInput("Ids"); - auto outputs = ctx.MultiOutput("Out"); + auto inputs = ctx.MultiInput("Ids"); + auto outputs = ctx.MultiOutput("Out"); const auto slot_size = inputs.size(); std::vector all_keys(slot_size); // BoxPS only supports float now @@ -63,7 +63,7 @@ template static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) { auto inputs = ctx.MultiInput("Ids"); auto d_output = - ctx.MultiInput(framework::GradVarName("Out")); + ctx.MultiInput(framework::GradVarName("Out")); const auto slot_size = inputs.size(); std::vector all_keys(slot_size); std::vector all_grad_values(slot_size); diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h index 0852a903645a6..c9da5a75c248f 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.h +++ b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -25,8 +25,8 @@ namespace operators { template static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { - auto inputs = ctx.MultiInput("Ids"); - auto outputs = ctx.MultiOutput("Out"); + auto inputs = ctx.MultiInput("Ids"); + auto outputs = ctx.MultiOutput("Out"); auto embedding_size_vec = ctx.Attr>("size"); const auto slot_size = inputs.size(); std::vector all_keys(slot_size); @@ -59,7 +59,7 @@ template static void PushGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { auto inputs = ctx.MultiInput("Ids"); auto d_output = - ctx.MultiInput(framework::GradVarName("Out")); + ctx.MultiInput(framework::GradVarName("Out")); const auto slot_size = inputs.size(); std::vector all_keys(slot_size); std::vector all_grad_values(slot_size); diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc index 6c31a178eaeb1..65fdb4700964a 100644 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -39,7 +39,7 @@ class PutAlongAxisOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return framework::OpKernelType( expected_kernel_type.data_type_, tensor.place(), tensor.layout()); @@ -80,7 +80,7 @@ class PutAlongAxisGradOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return framework::OpKernelType( expected_kernel_type.data_type_, tensor.place(), tensor.layout()); diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc index 0dd74f9324fa3..627f57c0e659c 100644 --- a/paddle/fluid/operators/pyramid_hash_op.cc +++ b/paddle/fluid/operators/pyramid_hash_op.cc @@ -28,7 +28,7 @@ extern "C" { namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using LoD = framework::LoD; @@ -276,9 +276,9 @@ class CPUPyramidHashOPKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* bottom = ctx.Input("X"); - auto* _blobs_0 = 
ctx.Input("W"); - auto* _blobs_1 = ctx.Input("WhiteList"); - auto* _blobs_2 = ctx.Input("BlackList"); + auto* _blobs_0 = ctx.Input("W"); + auto* _blobs_1 = ctx.Input("WhiteList"); + auto* _blobs_2 = ctx.Input("BlackList"); auto* top = ctx.Output("Out"); auto* drop_pos = ctx.Output("DropPos"); @@ -513,7 +513,7 @@ class CPUPyramidHashOPGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* bottom = ctx.Input("X"); - auto* _blobs = ctx.Input("W"); + auto* _blobs = ctx.Input("W"); auto* drop_pos = ctx.Input("DropPos"); auto* top = ctx.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index 7012da3aeda94..c03f158cac4e7 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -27,11 +27,11 @@ namespace operators { template struct ChannelDequantizeFunctorV2 { void operator()(const phi::CPUContext &dev_ctx, - const framework::Tensor *in, - const framework::Tensor *scale, + const phi::DenseTensor *in, + const phi::DenseTensor *scale, T max_range, const int quant_axis, - framework::Tensor *out) { + phi::DenseTensor *out) { // Dequant op is before quantized op // Dequantize the weight of quantized op auto in_dims = in->dims(); @@ -40,8 +40,8 @@ struct ChannelDequantizeFunctorV2 { if (quant_axis == 0) { for (int64_t i = 0; i < channel; i++) { T s = scale_factor[i]; - framework::Tensor one_channel_in = in->Slice(i, i + 1); - framework::Tensor one_channel_out = out->Slice(i, i + 1); + phi::DenseTensor one_channel_in = in->Slice(i, i + 1); + phi::DenseTensor one_channel_out = out->Slice(i, i + 1); auto in_e = framework::EigenVector::Flatten(one_channel_in); auto out_e = framework::EigenVector::Flatten(one_channel_out); auto &dev = *dev_ctx.eigen_device(); diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index 37ca11db3e3e2..c5d8b1928fd78 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -26,11 +26,11 @@ namespace operators { template struct ChannelDequantizeFunctorV2 { void operator()(const phi::GPUContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor* scale, + const phi::DenseTensor* in, + const phi::DenseTensor* scale, T max_range, const int quant_axis, - framework::Tensor* out) { + phi::DenseTensor* out) { auto in_dims = in->dims(); const T* in_data = in->data(); T* out_data = out->mutable_data(dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/quantize_linear_op.h b/paddle/fluid/operators/quantize_linear_op.h index fd0579023b378..8434996926aba 100644 --- a/paddle/fluid/operators/quantize_linear_op.h +++ b/paddle/fluid/operators/quantize_linear_op.h @@ -31,22 +31,22 @@ namespace operators { template struct ChannelDequantizeFunctorV2 { void operator()(const DeviceContext& dev_ctx, - const framework::Tensor* in, - const framework::Tensor** scales, + const phi::DenseTensor* in, + const phi::DenseTensor** scales, const int scale_num, T max_range, const int quant_axis, - framework::Tensor* out); + phi::DenseTensor* out); }; template class QuantizeLinearKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* in_scale = context.Input("Scale"); + auto* in = context.Input("X"); + auto* in_scale = context.Input("Scale"); - auto* out = context.Output("Y"); + 
auto* out = context.Output("Y"); out->mutable_data(context.GetPlace()); int bit_length = context.Attr("bit_length"); int round_type = context.Attr("round_type"); @@ -58,8 +58,8 @@ class QuantizeLinearKernel : public framework::OpKernel { if (quant_axis < 0) { if (!is_test) { // training - auto* in_accum = context.Input("InAccum"); - auto* in_state = context.Input("InState"); + auto* in_accum = context.Input("InAccum"); + auto* in_state = context.Input("InState"); phi::DenseTensor tmp_scale; tmp_scale.Resize(phi::make_dim(1)); T* cur_scale_data = dev_ctx.template Alloc(&tmp_scale); @@ -67,9 +67,9 @@ class QuantizeLinearKernel : public framework::OpKernel { FindAbsMaxFunctor()( dev_ctx, in->data(), in->numel(), cur_scale_data); - auto* out_state = context.Output("OutState"); - auto* out_accum = context.Output("OutAccum"); - auto* out_scale = context.Output("OutScale"); + auto* out_state = context.Output("OutState"); + auto* out_accum = context.Output("OutAccum"); + auto* out_scale = context.Output("OutScale"); out_state->mutable_data(context.GetPlace()); out_accum->mutable_data(context.GetPlace()); out_scale->mutable_data(context.GetPlace()); @@ -91,7 +91,7 @@ class QuantizeLinearKernel : public framework::OpKernel { } } else { if (!is_test) { - auto* out_scale = context.Output("OutScale"); + auto* out_scale = context.Output("OutScale"); T* out_scale_data = out_scale->mutable_data(context.GetPlace()); FindChannelAbsMaxFunctor()( dev_ctx, *in, quant_axis, out_scale_data); @@ -110,7 +110,7 @@ class DeQuantizeLinearKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto& dev_ctx = context.template device_context(); - auto* in = context.Input("X"); + auto* in = context.Input("X"); auto in_tmp = phi::Cast( static_cast { *in, experimental::CppTypeToDataType::Type()); - auto* scale = context.Input("Scale"); - auto* out = context.Output("Y"); + auto* scale = context.Input("Scale"); + auto* out = context.Output("Y"); int bit_length = context.Attr("bit_length"); auto quant_axis = context.Attr("quant_axis"); out->mutable_data(dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/quantize_op.h b/paddle/fluid/operators/quantize_op.h index dd1b3c42fb5f9..46a0469c806e1 100644 --- a/paddle/fluid/operators/quantize_op.h +++ b/paddle/fluid/operators/quantize_op.h @@ -23,7 +23,6 @@ namespace paddle { namespace operators { using framework::OpKernelType; -using framework::Tensor; class QuantOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index 0b8aaf2d97078..e59b0263c0dd5 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -30,7 +30,7 @@ static inline int GET_BLOCKS(const int N) { } using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template __global__ void random_routing_kernel(int64_t* data, diff --git a/paddle/fluid/operators/randperm_op.h b/paddle/fluid/operators/randperm_op.h index 3f4b02065a0fd..5512471fc2cdf 100644 --- a/paddle/fluid/operators/randperm_op.h +++ b/paddle/fluid/operators/randperm_op.h @@ -45,7 +45,7 @@ class RandpermKernel : public framework::OpKernel { int n = ctx.Attr("n"); unsigned int seed = static_cast(ctx.Attr("seed")); framework::Variable* out_var = ctx.OutputVar("Out"); - framework::Tensor* out_tensor = + phi::DenseTensor* out_tensor = 
framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); if (platform::is_cpu_place(ctx.GetPlace())) { @@ -53,7 +53,7 @@ class RandpermKernel : public framework::OpKernel { random_permate(out_data, n, seed); } else { - framework::Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; tmp_tensor.Resize(phi::make_ddim({n})); T* tmp_data = tmp_tensor.mutable_data(platform::CPUPlace()); random_permate(tmp_data, n, seed); diff --git a/paddle/fluid/operators/randperm_op_mlu.cc b/paddle/fluid/operators/randperm_op_mlu.cc index a3ebf8f5c00fc..2dcb0ff27e1ca 100644 --- a/paddle/fluid/operators/randperm_op_mlu.cc +++ b/paddle/fluid/operators/randperm_op_mlu.cc @@ -25,10 +25,10 @@ class RandpermMLUKernel : public framework::OpKernel { int n = ctx.Attr("n"); unsigned int seed = static_cast(ctx.Attr("seed")); framework::Variable* out_var = ctx.OutputVar("Out"); - framework::Tensor* out_tensor = + phi::DenseTensor* out_tensor = framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); - framework::Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; tmp_tensor.Resize(phi::make_ddim({n})); T* tmp_data = tmp_tensor.mutable_data(platform::CPUPlace()); random_permate(tmp_data, n, seed); diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc index ab9580d5ba95c..8a965034ac45a 100644 --- a/paddle/fluid/operators/range_op.cc +++ b/paddle/fluid/operators/range_op.cc @@ -31,7 +31,7 @@ class RangeOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (platform::is_xpu_place(tensor.place())) { return framework::OpKernelType( diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h index 1dd1c694bb91c..e59d4f3cfcadd 100644 --- a/paddle/fluid/operators/range_op.h +++ b/paddle/fluid/operators/range_op.h @@ -52,10 +52,10 @@ template class CPURangeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - T start = context.Input("Start")->data()[0]; - T end = context.Input("End")->data()[0]; - T step = context.Input("Step")->data()[0]; - auto* out = context.Output("Out"); + T start = context.Input("Start")->data()[0]; + T end = context.Input("End")->data()[0]; + T step = context.Input("Step")->data()[0]; + auto* out = context.Output("Out"); int64_t size = 0; GetSize(start, end, step, &size); out->Resize(phi::make_ddim({size})); diff --git a/paddle/fluid/operators/range_op_mlu.cc b/paddle/fluid/operators/range_op_mlu.cc index 3e15e0ced0a8f..13d067f8421ad 100644 --- a/paddle/fluid/operators/range_op_mlu.cc +++ b/paddle/fluid/operators/range_op_mlu.cc @@ -21,12 +21,12 @@ template class RangeMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* end_t = context.Input("End"); - auto* step_t = context.Input("Step"); - auto* out = context.Output("Out"); + auto* start_t = context.Input("Start"); + auto* end_t = context.Input("End"); + auto* step_t = context.Input("Step"); + auto* out = context.Output("Out"); - framework::Tensor n; + phi::DenseTensor n; framework::TensorCopy( *start_t, platform::CPUPlace(), diff --git a/paddle/fluid/operators/range_op_npu.cc b/paddle/fluid/operators/range_op_npu.cc index 9c063259f82e4..c9985187f5fc1 100644 --- 
a/paddle/fluid/operators/range_op_npu.cc +++ b/paddle/fluid/operators/range_op_npu.cc @@ -22,12 +22,12 @@ template class RangeNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* end_t = context.Input("End"); - auto* step_t = context.Input("Step"); - auto* out = context.Output("Out"); + auto* start_t = context.Input("Start"); + auto* end_t = context.Input("End"); + auto* step_t = context.Input("Step"); + auto* out = context.Output("Out"); - framework::Tensor n; + phi::DenseTensor n; framework::TensorCopy( *start_t, platform::CPUPlace(), diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index f68e1668aa9a7..4c740c5985ade 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class RankAttentionOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 83f6f23f98506..36117e605031e 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -24,20 +24,18 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - template class RankAttentionCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); - auto *rank_offset = ctx.Input("RankOffset"); - auto *param = ctx.Input("RankParam"); - auto *input_help = ctx.Output("InputHelp"); - auto *ins_rank = ctx.Output("InsRank"); + auto *X = ctx.Input("X"); + auto *rank_offset = ctx.Input("RankOffset"); + auto *param = ctx.Input("RankParam"); + auto *input_help = ctx.Output("InputHelp"); + auto *ins_rank = ctx.Output("InsRank"); int max_rank = ctx.Attr("MaxRank"); int64_t max_size = ctx.Attr("MaxSize"); - auto *Out = ctx.Output("Out"); + auto *Out = ctx.Output("Out"); // check dims auto x_dims = X->dims(); @@ -66,7 +64,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel { int max_ins = std::max(ins_num, max_size); - Tensor param_help; + phi::DenseTensor param_help; param_help = ctx.AllocateTmpTensor( {max_ins * block_matrix_row, para_col}, dev_ctx); param_help.mutable_data(ctx.GetPlace()); @@ -156,15 +154,17 @@ template class RankAttentionGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); // not use data - auto *rank_offset = ctx.Input("RankOffset"); // not use data - auto *param = ctx.Input("RankParam"); // not use data - auto *input_help = ctx.Input("InputHelp"); - auto *ins_rank = ctx.Input("InsRank"); - auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *X = ctx.Input("X"); // not use data + auto *rank_offset = + ctx.Input("RankOffset"); // not use data + auto *param = ctx.Input("RankParam"); // not use data + auto *input_help = ctx.Input("InputHelp"); + auto *ins_rank = ctx.Input("InsRank"); + auto *dout = ctx.Input(framework::GradVarName("Out")); int64_t max_size = ctx.Attr("MaxSize"); - auto *drank_para = ctx.Output(framework::GradVarName("RankParam")); + auto *drank_para = + ctx.Output(framework::GradVarName("RankParam")); // get dim auto x_dims = X->dims(); 
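For context on the mechanical pattern these hunks apply, the sketch below is an illustration only, not part of the patch: the operator name, the "X"/"Out" variable names, and the includes are assumptions, and it is meant to compile inside the Paddle source tree rather than standalone. It shows the shape of a fluid operator kernel after the framework::Tensor to phi::DenseTensor migration, which is the same substitution every hunk in this section performs.

// Illustrative sketch only; not taken from this patch.
// Before the migration a kernel typically relied on the file-local alias
//   using Tensor = framework::Tensor;
// Afterwards the alias (where it is kept) points at phi::DenseTensor and the
// ExecutionContext accessors name phi::DenseTensor explicitly.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace operators {

using Tensor = phi::DenseTensor;  // retargeted alias, as in the hunks above

template <typename T>
class ExampleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Inputs and outputs are requested as phi::DenseTensor directly.
    const phi::DenseTensor* x = ctx.Input<phi::DenseTensor>("X");
    phi::DenseTensor* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
    // ... actual computation elided ...
  }
};

}  // namespace operators
}  // namespace paddle

The MultiInput/MultiOutput and GradVarName variants seen throughout these hunks follow the same pattern, with only the accessor and variable name changing.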
@@ -188,7 +188,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel { drank_para_eigen.constant(static_cast(0)); // copy data - Tensor param_grad; + phi::DenseTensor param_grad; param_grad = ctx.AllocateTmpTensor( {max_ins * block_matrix_row, para_col}, dev_ctx); param_grad.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/rank_loss_op.h b/paddle/fluid/operators/rank_loss_op.h index 3e02cfb3fc1e0..4c81129c0efb0 100644 --- a/paddle/fluid/operators/rank_loss_op.h +++ b/paddle/fluid/operators/rank_loss_op.h @@ -25,10 +25,10 @@ template class RankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* label_t = ctx.Input("Label"); - auto* left_t = ctx.Input("Left"); - auto* right_t = ctx.Input("Right"); + auto* out_t = ctx.Output("Out"); + auto* label_t = ctx.Input("Label"); + auto* left_t = ctx.Input("Left"); + auto* right_t = ctx.Input("Right"); out_t->mutable_data(ctx.GetPlace()); auto out = framework::EigenVector::Flatten(*out_t); @@ -47,14 +47,14 @@ class RankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { auto* d_left_t = - ctx.Output(framework::GradVarName("Left")); + ctx.Output(framework::GradVarName("Left")); auto* d_right_t = - ctx.Output(framework::GradVarName("Right")); + ctx.Output(framework::GradVarName("Right")); - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* label_t = ctx.Input("Label"); - auto* left_t = ctx.Input("Left"); - auto* right_t = ctx.Input("Right"); + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* label_t = ctx.Input("Label"); + auto* left_t = ctx.Input("Left"); + auto* right_t = ctx.Input("Right"); auto& dev = *ctx.template device_context().eigen_device(); auto d_out = framework::EigenVector::Flatten(*d_out_t); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 03112fcd9ee58..ab15d0589d7c3 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -192,7 +192,7 @@ void RecurrentBase::LinkTensor(const framework::Scope &src_scope, src_vars, dst_scope, dst_vars, - [&](const framework::Tensor &src, framework::Tensor *dst) { + [&](const phi::DenseTensor &src, phi::DenseTensor *dst) { dst->ShareDataWith(src); }); } @@ -247,8 +247,8 @@ void RecurrentOp::RunImpl(const framework::Scope &scope, Inputs(kInputs), &cur_scope, Inputs(kInputs), - [&seq_offset](const framework::Tensor &outside, - framework::Tensor *inside) { + [&seq_offset](const phi::DenseTensor &outside, + phi::DenseTensor *inside) { inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1)); auto dims = phi::vectorize(inside->dims()); dims.erase(dims.begin()); @@ -374,7 +374,7 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope, Inputs(kOutputGrads), &cur_scope, Inputs(kOutputGrads), - [&](const framework::Tensor &outside, framework::Tensor *inside) { + [&](const phi::DenseTensor &outside, phi::DenseTensor *inside) { inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1)); auto dims = phi::vectorize(inside->dims()); dims.erase(dims.begin()); @@ -439,7 +439,7 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope, 0) { // Inside Gradient is not created. 
return; } - framework::Tensor src_slice = + phi::DenseTensor src_slice = src_tensor.Slice(seq_offset, seq_offset + 1); dst_tensor->ShareDataWith(src_slice); }, diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc index 434b32329cfaa..68615a44e97c8 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc @@ -25,8 +25,8 @@ template class XPULogsumexpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); auto axis = context.Attr>("axis"); auto reduce_all = context.Attr("reduce_all"); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc index 34d45e0ae5f32..ce06d1b1089a5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc @@ -21,14 +21,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ReduceAnyNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + const phi::DenseTensor* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); bool keep_dim = ctx.Attr("keep_dim"); auto dims = ctx.Attr>("dim"); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index 4f76e47069b5e..d652f8b805222 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -33,7 +33,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -using Tensor = paddle::framework::Tensor; +using Tensor = phi::DenseTensor; USE_OP_ITSELF(reduce_any); USE_OP_DEVICE_KERNEL(reduce_any, NPU); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc index 310c1db205da6..1ece3bdf72616 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc @@ -22,8 +22,8 @@ template class ReduceMaxMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); int out_dtype = context.Attr("out_dtype"); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); @@ -45,7 +45,7 @@ class ReduceMaxMLUKernel : public framework::OpKernel { } auto place = context.GetPlace(); - framework::Tensor cast_out(input->type()); + phi::DenseTensor cast_out(input->type()); cast_out.Resize(output->dims()); cast_out.mutable_data(place); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index 13be33bae3db7..172786963e4c9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -18,13 +18,13 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ReduceMaxNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto dims = ctx.Attr>("dim"); bool keep_dim = ctx.Attr("keep_dim"); bool reduce_all = ctx.Attr("reduce_all"); @@ -32,7 +32,7 @@ class ReduceMaxNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); - framework::Tensor cast_out(x->type()); + phi::DenseTensor cast_out(x->type()); cast_out.Resize(out->dims()); cast_out.mutable_data(place); @@ -115,9 +115,10 @@ template class ReduceMaxGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Input("Out"); - auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto* out_grad = + context.Input(framework::GradVarName("Out")); auto reduce_dims = context.Attr>("dim"); bool reduce_all = context.Attr("reduce_all"); int in_dtype = context.Attr("in_dtype"); @@ -128,7 +129,8 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "NPU only support in_dtype == -1 in reduce_max_grad op.")); - auto* x_grad = context.Output(framework::GradVarName("X")); + auto* x_grad = + context.Output(framework::GradVarName("X")); x_grad->mutable_data(context.GetPlace()); auto& dev_ctx = diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc index 1faffd57c9ab3..b73bde6275347 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc @@ -29,9 +29,11 @@ template class ReduceMeanGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output_grad = context.Input(framework::GradVarName("Out")); - auto* input_grad = context.Output(framework::GradVarName("X")); + auto* input = context.Input("X"); + auto* output_grad = + context.Input(framework::GradVarName("Out")); + auto* input_grad = + context.Output(framework::GradVarName("X")); input_grad->mutable_data(context.GetPlace()); bool reduce_all = context.Attr("reduce_all"); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc index 4e277d2c62231..feca58ce19861 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc @@ -22,8 +22,8 @@ template class NPUReduceMeanOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); output->mutable_data(ctx.GetPlace()); bool reduce_all = ctx.Attr("reduce_all"); @@ -56,9 +56,11 @@ template class NPUReduceMeanGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = 
ctx.Output(framework::GradVarName("X")); + auto* input = ctx.Input("X"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); input_grad->mutable_data(ctx.GetPlace()); bool reduce_all = ctx.Attr("reduce_all"); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc index 43879af06ea59..631b32e59c822 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc @@ -22,8 +22,8 @@ template class ReduceMinMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); int out_dtype = context.Attr("out_dtype"); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); @@ -45,7 +45,7 @@ class ReduceMinMLUKernel : public framework::OpKernel { } auto place = context.GetPlace(); - framework::Tensor cast_out(input->type()); + phi::DenseTensor cast_out(input->type()); cast_out.Resize(output->dims()); cast_out.mutable_data(place); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc index 70d995284a288..19efb2e6bfb4c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc @@ -18,13 +18,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ReduceMinNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto dims = ctx.Attr>("dim"); bool keep_dim = ctx.Attr("keep_dim"); bool reduce_all = ctx.Attr("reduce_all"); @@ -32,7 +32,7 @@ class ReduceMinNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); - framework::Tensor cast_out(x->type()); + phi::DenseTensor cast_out(x->type()); cast_out.Resize(out->dims()); cast_out.mutable_data(place); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index d7f153700cfa2..a62bac88ca399 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -32,8 +32,8 @@ template void TensorReduceImpl(const phi::GPUContext& dev_ctx, - const framework::Tensor& x, - framework::Tensor* y, + const phi::DenseTensor& x, + phi::DenseTensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, gpuStream_t stream, diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index d305a65e0d133..991fdfeed176c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -48,7 +48,7 @@ namespace operators { keep_dim); \ } -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DDim = framework::DDim; inline void GetShuffledDim(const DDim& src_dims, @@ -101,8 +101,8 @@ static inline std::vector GetReduceDim(const std::vector& dims, } template void GetShuffledInput(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* 
shuffled_input, + const phi::DenseTensor* input, + phi::DenseTensor* shuffled_input, const std::vector& dims) { DDim shuffled_dims(input->dims()); std::vector perm_axis(input->dims().size()); @@ -132,8 +132,8 @@ inline void GetOriginDimFromShuffled(const DDim& src_dim, template void HandleLargeDim(const framework::ExecutionContext& context, - const Tensor* input, - Tensor* output, + const phi::DenseTensor* input, + phi::DenseTensor* output, const std::vector& dims, bool keep_dim) { // shuffle the reduced dim to the end @@ -157,10 +157,10 @@ void HandleLargeDim(const framework::ExecutionContext& context, template void HandleLargeDimGrad(const framework::ExecutionContext& context, - const framework::Tensor* x, - const framework::Tensor* out, - const framework::Tensor* dout, - framework::Tensor* dx, + const phi::DenseTensor* x, + const phi::DenseTensor* out, + const phi::DenseTensor* dout, + phi::DenseTensor* dx, Functor functor, const std::vector& dims) { const int64_t unreduced = out->numel(); @@ -198,14 +198,14 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, template struct ReduceKernelFunctor { - const Tensor* input; - Tensor* output; + const phi::DenseTensor* input; + phi::DenseTensor* output; std::vector dims; bool keep_dim; bool reduce_all; const framework::ExecutionContext& context; - ReduceKernelFunctor(const Tensor* input, - Tensor* output, + ReduceKernelFunctor(const phi::DenseTensor* input, + phi::DenseTensor* output, const std::vector& dims, bool keep_dim, bool reduce_all, @@ -261,12 +261,12 @@ class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); - auto* output = context.Output("Out"); + auto* output = context.Output("Out"); auto dims = context.Attr>("dim"); bool keep_dim = context.Attr("keep_dim"); int out_dtype = context.Attr("out_dtype"); framework::proto::VarType::Type cast_out_dtype; - auto* input = context.Input("X"); + auto* input = context.Input("X"); if (out_dtype < 0) { cast_out_dtype = static_cast( @@ -299,10 +299,10 @@ class ReduceKernel : public framework::OpKernel { template void LaunchReduceGradKernel(const framework::ExecutionContext& context, - const framework::Tensor* input0, - const framework::Tensor* input1, - const framework::Tensor* input2, - paddle::framework::Tensor* output, + const phi::DenseTensor* input0, + const phi::DenseTensor* input1, + const phi::DenseTensor* input2, + phi::DenseTensor* output, Functor functor, const std::vector& dims, bool reduce_all = false) { @@ -400,18 +400,20 @@ template class ReduceGradKernel : public framework::OpKernel { public: - void ComputeFromInput(const Tensor* input2, + void ComputeFromInput(const phi::DenseTensor* input2, const framework::ExecutionContext& context) const { bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); - auto* input0 = context.Input("X"); - auto* input1 = context.Input("Out"); + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Out"); - auto* output = context.Output(framework::GradVarName("X")); + auto* output = + context.Output(framework::GradVarName("X")); output->mutable_data(context.GetPlace()); // The dims has full dim, set the reduce_all is True - const auto& input_dim_size = context.Input("X")->dims().size(); + const auto& input_dim_size = + context.Input("X")->dims().size(); std::set dims_set(dims.begin(), dims.end()); bool full_dim = true; for (auto i = 0; i < input_dim_size; i++) { @@ -452,7 
+454,8 @@ class ReduceGradKernel : public framework::OpKernel { int in_dtype = context.Attr("in_dtype"); if (in_dtype >= 0) { Tensor tmp_tensor; - auto* pre_input = context.Input(framework::GradVarName("Out")); + auto* pre_input = + context.Input(framework::GradVarName("Out")); auto in_kernel_type = framework::OpKernelType( framework::TransToProtoVarType(pre_input->dtype()), context.GetPlace()); @@ -464,7 +467,8 @@ class ReduceGradKernel : public framework::OpKernel { ComputeFromInput(&tmp_tensor, context); } else { - auto* input2 = context.Input(framework::GradVarName("Out")); + auto* input2 = + context.Input(framework::GradVarName("Out")); ComputeFromInput(input2, context); } } @@ -666,7 +670,7 @@ class ReduceGradOp : public framework::OperatorWithKernel { ctx, framework::GradVarName("Out")); #ifdef PADDLE_WITH_MKLDNN auto CanMKLDNNReduceGradBeUsed = [&]() { - auto dx_dims = ctx.Input("X")->dims(); + auto dx_dims = ctx.Input("X")->dims(); if (dx_dims.size() > 5) return false; // max 5D tensor is supported @@ -745,8 +749,8 @@ class ReduceCudaKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); - const Tensor* input = context.Input("X"); - Tensor* output = context.Output("Out"); + const phi::DenseTensor* input = context.Input("X"); + phi::DenseTensor* output = context.Output("Out"); auto out_dtype = context.Attr("out_dtype"); auto pt_out_dtype = paddle::framework::TransToPhiDataType( static_cast(out_dtype)); @@ -777,11 +781,11 @@ class ReduceCudaGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); std::vector dims = context.Attr>("dim"); - auto* in_x = context.Input("X"); + auto* in_x = context.Input("X"); auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); + context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); auto out_dtype = context.Attr("in_dtype"); auto pt_out_dtype = framework::TransToPhiDataType( static_cast(out_dtype)); @@ -795,7 +799,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { update_dims[i] = 1; } // make new tensor - framework::Tensor new_d_out(d_out->type()); + phi::DenseTensor new_d_out(d_out->type()); new_d_out.ShareDataWith(*d_out); new_d_out.Resize(phi::make_ddim(update_dims)); auto& dev_ctx = context.cuda_device_context(); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h index 5f02a475d7a91..39a0dc044f272 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_function.h @@ -21,7 +21,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DDim = framework::DDim; template void ReduceFunctor(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* output, + const phi::DenseTensor& input, + phi::DenseTensor* output, const std::vector& dims, bool keep_dim) { auto x = EigenTensor::From(input); @@ -81,10 +81,10 @@ void ReduceFunctor(const DeviceContext& context, template void ReduceGradFunctor(const DeviceContext& context, - const framework::Tensor& input0, - const framework::Tensor& input1, - const framework::Tensor& input2, - framework::Tensor* output, + const phi::DenseTensor& input0, + const 
phi::DenseTensor& input1, + const phi::DenseTensor& input2, + phi::DenseTensor* output, Functor functor, const std::vector& dims) { auto x = EigenTensor::From(input0); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h index 27c5f144bef04..6af7967b81150 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h @@ -31,8 +31,8 @@ void MLUReduceOp(const framework::ExecutionContext& context, platform::is_mlu_place(context.GetPlace()), true, platform::errors::Unavailable("This kernel only runs on MLU.")); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); output->mutable_data(context.GetPlace()); bool reduce_all = context.Attr("reduce_all"); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h index 57df3c1a887f5..35cc8fea6d0ba 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h @@ -40,8 +40,8 @@ void XPUReduce(const framework::ExecutionContext& context, platform::errors::Unavailable("This kernel only runs on XPU.")); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); - auto* x = context.Input("X"); - auto* y = context.Output("Out"); + auto* x = context.Input("X"); + auto* y = context.Output("Out"); y->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc index 5c94bfc4bd0a8..85b589ebf916e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc @@ -18,13 +18,13 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ReduceProdNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto dims = ctx.Attr>("dim"); bool keep_dim = ctx.Attr("keep_dim"); bool reduce_all = ctx.Attr("reduce_all"); @@ -32,7 +32,7 @@ class ReduceProdNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); - framework::Tensor cast_out(x->type()); + phi::DenseTensor cast_out(x->type()); cast_out.Resize(out->dims()); cast_out.mutable_data(place); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h index e0c11feb036f2..69c8935dafd6b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h @@ -28,12 +28,13 @@ template class ReduceSumGradKernel : public framework::OpKernel { public: - void ComputeFromInput(const Tensor* input2, + void ComputeFromInput(const phi::DenseTensor* input2, const framework::ExecutionContext& context) const { auto dims = context.Attr>("dim"); - auto* input0 = context.Input("X"); + auto* input0 = context.Input("X"); - auto* output = context.Output(framework::GradVarName("X")); + auto* output = + context.Output(framework::GradVarName("X")); output->mutable_data(context.GetPlace()); const auto* input2_d = input2->data(); auto* output_d = output->data(); @@ -80,7 +81,8 @@ class ReduceSumGradKernel : public framework::OpKernel { if (in_dtype >= 0) { Tensor tmp_tensor; - auto* pre_input = context.Input(framework::GradVarName("Out")); + auto* pre_input = + context.Input(framework::GradVarName("Out")); auto in_kernel_type = framework::OpKernelType( framework::TransToProtoVarType(pre_input->dtype()), context.GetPlace()); @@ -91,7 +93,8 @@ class ReduceSumGradKernel : public framework::OpKernel { in_kernel_type, out_kernel_type, *pre_input, &tmp_tensor); ComputeFromInput(&tmp_tensor, context); } else { - auto* input2 = context.Input(framework::GradVarName("Out")); + auto* input2 = + context.Input(framework::GradVarName("Out")); ComputeFromInput(input2, context); } return; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc index e8b66a2bf2f7c..4ecf6e907b4cb 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc @@ -29,9 +29,11 @@ template class ReduceSumGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* in_grad = context.Output(framework::GradVarName("X")); + auto* in = context.Input("X"); + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* in_grad = + context.Output(framework::GradVarName("X")); in_grad->mutable_data(context.GetPlace()); bool reduce_all = context.Attr("reduce_all"); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc index e3b5755d1a6b9..6ba8a9c1373a1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc @@ -26,8 +26,8 @@ template class ReduceSumNPUKernel : public 
framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); bool reduce_all = ctx.Attr("reduce_all"); bool keep_dims = ctx.Attr("keep_dim"); auto dims = ctx.Attr>("dim"); @@ -43,8 +43,8 @@ class ReduceSumNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - framework::Tensor cast_x; - framework::Tensor cast_out; + phi::DenseTensor cast_x; + phi::DenseTensor cast_out; // NOTE: ReduceSumD only supports fp32 and fp16 if (framework::TransToProtoVarType(x->dtype()) != framework::proto::VarType::FP32 && @@ -106,10 +106,9 @@ template class ReduceSumGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); bool reduce_all = ctx.Attr("reduce_all"); bool keep_dims = ctx.Attr("keep_dim"); auto dims = ctx.Attr>("dim"); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc index 1d36bdb284121..29f24de021b49 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc @@ -36,9 +36,10 @@ class ReduceSumGradXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto dims = context.Attr>("dim"); bool reduce_all = context.Attr("reduce_all"); - auto* x = context.Input("X"); - auto* out = context.Input(framework::GradVarName("Out")); - auto* x_grad = context.Output(framework::GradVarName("X")); + auto* x = context.Input("X"); + auto* out = context.Input(framework::GradVarName("Out")); + auto* x_grad = + context.Output(framework::GradVarName("X")); int in_dtype = context.Attr("in_dtype"); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/repeat_interleave_op.cc b/paddle/fluid/operators/repeat_interleave_op.cc index a3f04dd202a3c..aaef332bd0007 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cc +++ b/paddle/fluid/operators/repeat_interleave_op.cc @@ -21,8 +21,6 @@ namespace paddle { namespace operators { -using framework::Tensor; - class RepeatInterleaveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/requantize_op.h b/paddle/fluid/operators/requantize_op.h index 8166aa98f076f..5b2f0148f1529 100644 --- a/paddle/fluid/operators/requantize_op.h +++ b/paddle/fluid/operators/requantize_op.h @@ -23,7 +23,6 @@ namespace paddle { namespace operators { using framework::OpKernelType; -using framework::Tensor; class ReQuantOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 6a25e2c790287..f54f4880747a5 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -41,7 +41,7 @@ class OpBase; namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class ReshapeOp : public framework::OperatorWithKernel { public: @@ -383,7 +383,7 @@ class ReshapeKernel { auto *in = ctx.Input("X"); auto 
list_new_shape_tensor = - ctx.MultiInput("ShapeTensor"); + ctx.MultiInput("ShapeTensor"); auto *shape_tensor = ctx.HasInput("Shape") ? ctx.Input("Shape") : nullptr; @@ -394,7 +394,7 @@ class ReshapeKernel { for (auto &tensor : list_new_shape_tensor) { if (platform::is_gpu_place(tensor->place()) || platform::is_xpu_place(tensor->place())) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync( *tensor, platform::CPUPlace(), &temp); pt_vec_shape.push_back(std::move(temp)); @@ -407,7 +407,7 @@ class ReshapeKernel { phi::DenseTensor pt_shape; if (platform::is_gpu_place(shape_tensor->place()) || platform::is_xpu_place(shape_tensor->place())) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync( *shape_tensor, platform::CPUPlace(), &temp); pt_shape = std::move(temp); @@ -450,8 +450,8 @@ class ReshapeKernel { class ReshapeGradKernel { public: void operator()(const framework::ExecutionContext &ctx) const { - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_out = ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); d_x->mutable_data(ctx.GetPlace(), d_out->type()); if (platform::is_cpu_place(ctx.GetPlace())) { @@ -479,9 +479,9 @@ class ReshapeGradKernel { class ReshapeDoubleGradKernel { public: void operator()(const framework::ExecutionContext &ctx) const { - auto *dd_x = ctx.Input("DDX"); - auto *d_out = ctx.Input("DOut"); - auto *dd_out = ctx.Output("DDOut"); + auto *dd_x = ctx.Input("DDX"); + auto *d_out = ctx.Input("DOut"); + auto *dd_out = ctx.Output("DDOut"); dd_out->mutable_data(ctx.GetPlace(), dd_x->type()); if (platform::is_cpu_place(ctx.GetPlace())) { diff --git a/paddle/fluid/operators/reshape_op_mlu.cc b/paddle/fluid/operators/reshape_op_mlu.cc index 46ab9534b6801..fa04ea6a3e50f 100644 --- a/paddle/fluid/operators/reshape_op_mlu.cc +++ b/paddle/fluid/operators/reshape_op_mlu.cc @@ -22,11 +22,11 @@ template class Reshape2MLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); std::vector target_shape_vector; - auto shape_tensor_vector = ctx.MultiInput("ShapeTensor"); + auto shape_tensor_vector = ctx.MultiInput("ShapeTensor"); if (shape_tensor_vector.size() > 0) { for (auto* shape_tensor : shape_tensor_vector) { PADDLE_ENFORCE_EQ( @@ -117,8 +117,8 @@ template class Reshape2GradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_out = ctx.Input(framework::GradVarName("Out")); auto in_dims = d_x->dims(); d_x->mutable_data(ctx.GetPlace(), d_out->type()); diff --git a/paddle/fluid/operators/reshape_op_npu.cc b/paddle/fluid/operators/reshape_op_npu.cc index a5ffeb5080799..e87f433586874 100644 --- a/paddle/fluid/operators/reshape_op_npu.cc +++ b/paddle/fluid/operators/reshape_op_npu.cc @@ -30,11 +30,11 @@ class Reshape2NPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); auto place = ctx.GetPlace(); - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); std::vector target_shape_vector; - auto 
shape_tensor_vector = ctx.MultiInput("ShapeTensor"); + auto shape_tensor_vector = ctx.MultiInput("ShapeTensor"); if (shape_tensor_vector.size() > 0) { for (auto* shape_tensor : shape_tensor_vector) { PADDLE_ENFORCE_EQ( @@ -127,8 +127,8 @@ template class Reshape2GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_out = ctx.Input(framework::GradVarName("Out")); auto in_dims = d_x->dims(); d_x->mutable_data(ctx.GetPlace(), d_out->type()); diff --git a/paddle/fluid/operators/rnn_op_mlu.cc b/paddle/fluid/operators/rnn_op_mlu.cc index fe567333b6d40..cf4e255668232 100644 --- a/paddle/fluid/operators/rnn_op_mlu.cc +++ b/paddle/fluid/operators/rnn_op_mlu.cc @@ -20,9 +20,9 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DDim = framework::DDim; -using TensorList = std::vector; +using TensorList = std::vector; template void reset_parameter_vector( const std::vector& raw_params_vec, @@ -60,23 +60,23 @@ class RNNMLUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { // Input auto& dev_ctx = GetDevCtxFromCTX(ctx); - auto* input = ctx.Input("Input"); - auto pre_state = ctx.MultiInput("PreState"); - auto weight_list = ctx.MultiInput("WeightList"); + auto* input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); bool has_seq_length = ctx.HasInput("SequenceLength"); // Output - auto state = ctx.MultiOutput("State"); - auto* output = ctx.Output("Out"); - auto* reserve_data = ctx.Output("Reserve"); + auto state = ctx.MultiOutput("State"); + auto* output = ctx.Output("Out"); + auto* reserve_data = ctx.Output("Reserve"); // Attributes const int& num_layers = ctx.Attr("num_layers"); const bool& is_bidirec = ctx.Attr("is_bidirec"); const int& hidden_size = ctx.Attr("hidden_size"); const std::string& mode = ctx.Attr("mode"); - const Tensor* sequence_length = nullptr; + const phi::DenseTensor* sequence_length = nullptr; if (has_seq_length) { - sequence_length = ctx.Input("SequenceLength"); + sequence_length = ctx.Input("SequenceLength"); } auto init_h = pre_state[0]; // -> hx @@ -178,7 +178,7 @@ class RNNMLUKernel : public framework::OpKernel { // copy weight params size_t weightspace_size; - framework::Tensor weightspace; + phi::DenseTensor weightspace; PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNWeightSpaceSize( GetHandleFromCTX(ctx), rnn_desc.get(), &weightspace_size)); @@ -306,10 +306,9 @@ class RNNMLUKernel : public framework::OpKernel { auto masked_mode = CNNL_MASKED_FILL; float off_value = 0.0f; - framework::Tensor on_value_tensor(input->dtype()); - framework::Tensor masked_tensor(framework::TransToPhiDataType(VT::INT8)); - framework::Tensor h_masked_tensor( - framework::TransToPhiDataType(VT::INT8)); + phi::DenseTensor on_value_tensor(input->dtype()); + phi::DenseTensor masked_tensor(framework::TransToPhiDataType(VT::INT8)); + phi::DenseTensor h_masked_tensor(framework::TransToPhiDataType(VT::INT8)); on_value_tensor.Resize({1}); masked_tensor.Resize({seq_len, batch_size, direction_num * hidden_size}); h_masked_tensor.Resize( @@ -362,20 +361,20 @@ class RNNMLUGradKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); 
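A minimal sketch of the multi-tensor accessors used by the RNN MLU forward kernel in this hunk, with the stripped template arguments reconstructed (assumption):

    using TensorList = std::vector<phi::DenseTensor>;   // was std::vector<framework::Tensor>

    auto* input      = ctx.Input<phi::DenseTensor>("Input");
    auto pre_state   = ctx.MultiInput<phi::DenseTensor>("PreState");    // std::vector<const phi::DenseTensor*>
    auto weight_list = ctx.MultiInput<phi::DenseTensor>("WeightList");
    auto state       = ctx.MultiOutput<phi::DenseTensor>("State");      // std::vector<phi::DenseTensor*>
    auto* output     = ctx.Output<phi::DenseTensor>("Out");

    const phi::DenseTensor* sequence_length = nullptr;
    if (ctx.HasInput("SequenceLength")) {
      sequence_length = ctx.Input<phi::DenseTensor>("SequenceLength");
    }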
auto stream = ctx.template device_context().stream(); // get the tensor pointer for the input - auto* input = ctx.Input("Input"); - auto pre_state = ctx.MultiInput("PreState"); - auto weight_list = ctx.MultiInput("WeightList"); - auto* output = ctx.Input("Out"); - auto* reserve_data = ctx.Input("Reserve"); + auto* input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); + auto* output = ctx.Input("Out"); + auto* reserve_data = ctx.Input("Reserve"); const int& num_layers = ctx.Attr("num_layers"); const bool& is_bidirec = ctx.Attr("is_bidirec"); const int& hidden_size = ctx.Attr("hidden_size"); const std::string& mode = ctx.Attr("mode"); bool has_seq_length = ctx.HasInput("SequenceLength"); - const Tensor* sequence_length = nullptr; + const phi::DenseTensor* sequence_length = nullptr; if (has_seq_length) { - sequence_length = ctx.Input("SequenceLength"); + sequence_length = ctx.Input("SequenceLength"); } PADDLE_ENFORCE_EQ( @@ -387,19 +386,22 @@ class RNNMLUGradKernel : public framework::OpKernel { auto init_h = pre_state[0]; // -> hx auto init_c = pre_state[1]; // -> cx - auto output_grad = ctx.Input(framework::GradVarName("Out")); - auto state_grad = ctx.MultiInput(framework::GradVarName("State")); + auto output_grad = + ctx.Input(framework::GradVarName("Out")); + auto state_grad = + ctx.MultiInput(framework::GradVarName("State")); auto last_h_grad = state_grad[0]; // -> dhy auto last_c_grad = state_grad[1]; // -> dcy // get the tensor pointer for the output - auto* input_grad = ctx.Output(framework::GradVarName("Input")); - auto weight_grad_list = ctx.MultiOutput( - framework::GradVarName("WeightList")); + auto* input_grad = + ctx.Output(framework::GradVarName("Input")); + auto weight_grad_list = + ctx.MultiOutput(framework::GradVarName("WeightList")); auto pre_state_grad = - ctx.MultiOutput(framework::GradVarName("PreState")); - Tensor* init_h_grad = nullptr; - Tensor* init_c_grad = nullptr; + ctx.MultiOutput(framework::GradVarName("PreState")); + phi::DenseTensor* init_h_grad = nullptr; + phi::DenseTensor* init_c_grad = nullptr; if (pre_state_grad.size() > 0) { // has gradient init_h_grad = pre_state_grad[0]; // -> dhx init_c_grad = pre_state_grad[1]; // -> dcx @@ -458,8 +460,8 @@ class RNNMLUGradKernel : public framework::OpKernel { FillMLUTensorWithHostValue(ctx, static_cast(0.0), input_grad); Tensor a, b; - Tensor* dynamic_grad_pre_h = &a; - Tensor* dynamic_grad_pre_c = &b; + phi::DenseTensor* dynamic_grad_pre_h = &a; + phi::DenseTensor* dynamic_grad_pre_c = &b; if (init_h_grad) { init_h_grad->mutable_data(last_h_grad->dims(), ctx.GetPlace()); FillMLUTensorWithHostValue(ctx, static_cast(0.0), init_h_grad); @@ -516,7 +518,7 @@ class RNNMLUGradKernel : public framework::OpKernel { // copy weight size_t weightspace_size; - framework::Tensor weightspace, dweightspace; + phi::DenseTensor weightspace, dweightspace; PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNWeightSpaceSize( GetHandleFromCTX(ctx), rnn_desc.get(), &weightspace_size)); diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 922d255bbe20e..75054916e90da 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -20,7 +20,7 @@ limitations under the License. 
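A minimal sketch of the gradient-side accessors in the RNNMLUGradKernel hunk above, again with the elided template arguments filled in as an assumption:

    auto output_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto state_grad  = ctx.MultiInput<phi::DenseTensor>(framework::GradVarName("State"));

    auto* input_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
    auto weight_grad_list =
        ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("WeightList"));
    auto pre_state_grad =
        ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("PreState"));

    phi::DenseTensor* init_h_grad = nullptr;   // local grad pointers now use the phi type
    phi::DenseTensor* init_c_grad = nullptr;
    if (pre_state_grad.size() > 0) {
      init_h_grad = pre_state_grad[0];   // -> dhx
      init_c_grad = pre_state_grad[1];   // -> dcx
    }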
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class ROIAlignOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/roi_align_op_mlu.cc b/paddle/fluid/operators/roi_align_op_mlu.cc index c6f17b56cd074..58791ef1bca2a 100644 --- a/paddle/fluid/operators/roi_align_op_mlu.cc +++ b/paddle/fluid/operators/roi_align_op_mlu.cc @@ -19,16 +19,16 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template class ROIAlignOpMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); + auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); out->set_layout(framework::DataLayout::kNHWC); @@ -46,7 +46,7 @@ class ROIAlignOpMLUKernel : public framework::OpKernel { std::vector roi_batch_id_list(rois_num); int rois_batch_size = 0; if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); + auto* rois_num_t = ctx.Input("RoisNum"); rois_batch_size = rois_num_t->numel(); PADDLE_ENFORCE_EQ( rois_batch_size, @@ -176,8 +176,8 @@ class ROIAlignGradOpMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* rois = ctx.Input("ROIs"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); @@ -193,7 +193,7 @@ class ROIAlignGradOpMLUKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); int rois_batch_size = 0; if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); + auto* rois_num_t = ctx.Input("RoisNum"); rois_batch_size = rois_num_t->numel(); std::vector rois_num_list(rois_batch_size); memory::Copy(cplace, diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index f14e29f8ddc27..8fd2616a92cc4 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -15,16 +15,16 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ROIAlignNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); // (B,C,H,W) - auto* ROIs = ctx.Input("ROIs"); // (N,4) - auto* ROIsNum = ctx.Input("RoisNum"); // [0 1 1 2 2 2] - auto* Out = ctx.Output("Out"); + auto* X = ctx.Input("X"); // (B,C,H,W) + auto* ROIs = ctx.Input("ROIs"); // (N,4) + auto* ROIsNum = ctx.Input("RoisNum"); // [0 1 1 2 2 2] + auto* Out = ctx.Output("Out"); Out->mutable_data(ctx.GetPlace()); auto spatial_scale = ctx.Attr("spatial_scale"); @@ -63,7 +63,7 @@ class ROIAlignNPUKernel : public framework::OpKernel { runner_c.Run(stream); // concate to make (N, 5) - std::vector x_list; + std::vector x_list; x_list.push_back(ROIsNum_fp); x_list.push_back(*ROIs); auto axis = 1; @@ -95,11 +95,10 @@ template class ROIAlignNPUGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); + auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); auto pooled_height = ctx.Attr("pooled_height"); auto pooled_width = ctx.Attr("pooled_width"); @@ -137,7 +136,7 @@ class ROIAlignNPUGradKernel : public framework::OpKernel { "ROIAlignGradNPU only support ROIs type equaled to FP32.")); // Cast RoisNum to fp32 tensor - auto* RoisNum = ctx.Input("RoisNum"); + auto* RoisNum = ctx.Input("RoisNum"); Tensor ROIs_N5; ROIs_N5.mutable_data({rois_num, 5}, place); Tensor ROIsNum_fp; @@ -150,7 +149,7 @@ class ROIAlignNPUGradKernel : public framework::OpKernel { ROIsNum_fp.Resize({rois_num, 1}); // Combine *ROIsNum with ROIs to get new ROIs - std::vector x_list; + std::vector x_list; x_list.push_back(ROIsNum_fp); x_list.push_back(*rois); const auto& runner_concat = NpuOpRunner( diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index c95e235aff98b..74b9b0c06c3d0 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -23,7 +23,7 @@ limitations under the License. 
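A minimal sketch of the temporary-tensor pattern in the ROIAlign NPU kernels above; the element and data types are reconstructed and the surrounding variables (rois_num, place, ROIsNum_fp, rois) come from the kernel body, so treat this as an assumption:

    phi::DenseTensor ROIs_N5;                       // temporaries declared as phi::DenseTensor
    ROIs_N5.mutable_data<float>({rois_num, 5}, place);

    std::vector<phi::DenseTensor> x_list;           // was std::vector<framework::Tensor>
    x_list.push_back(ROIsNum_fp);
    x_list.push_back(*rois);                        // concatenated by the NPU runner to form (N, 5)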
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class ROIPoolOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index 7ac1d4b8d4508..a504c7f8ddb87 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -26,8 +26,6 @@ namespace paddle { namespace operators { -using framework::Tensor; - class RollOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index 1bf471641d5a5..4dcda3fdae29c 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -24,7 +24,6 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using framework::Tensor; template : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); - auto *filter = context.Input("Filter"); + auto *filter = context.Input("Filter"); auto *out = context.Output("Out"); out->mutable_data(context.GetPlace()); @@ -184,12 +183,12 @@ class RowConvKernel : public framework::OpKernel { current_timesteps = end - start; } // int current_timesteps = end - start; - Tensor cur_input_sequence = + phi::DenseTensor cur_input_sequence = x->Slice(start, end); // Current input sequence cur_input_sequence = cur_input_sequence.Resize({current_timesteps, input_dim}); - Tensor cur_output_sequence = + phi::DenseTensor cur_output_sequence = out->Slice(start, end); // Current output sequence cur_output_sequence = cur_output_sequence.Resize({current_timesteps, input_dim}); @@ -219,10 +218,11 @@ class RowConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); - auto *filter = context.Input("Filter"); + auto *filter = context.Input("Filter"); auto *d_out = context.Input(framework::GradVarName("Out")); auto *dx = context.Output(framework::GradVarName("X")); - auto *d_filter = context.Output(framework::GradVarName("Filter")); + auto *d_filter = + context.Output(framework::GradVarName("Filter")); auto &x_lod = x->lod(); bool is_tensor = x_lod.empty(); @@ -264,9 +264,10 @@ class RowConvGradKernel : public framework::OpKernel { } else { current_timesteps = end - start; } - Tensor cur_input = x->Slice(start, end); // Current input sequence + phi::DenseTensor cur_input = + x->Slice(start, end); // Current input sequence cur_input = cur_input.Resize({current_timesteps, input_dim}); - Tensor cur_doutput = + phi::DenseTensor cur_doutput = d_out->Slice(start, end); // Current output grad sequence cur_doutput = cur_doutput.Resize({current_timesteps, input_dim}); auto cur_ip = EigenMatrix::From(cur_input); @@ -298,10 +299,10 @@ class RowConvGradKernel : public framework::OpKernel { current_timesteps = end - start; } - Tensor cur_doutput = + phi::DenseTensor cur_doutput = d_out->Slice(start, end); // Current output grad sequence cur_doutput = cur_doutput.Resize({current_timesteps, input_dim}); - Tensor cur_dinput = + phi::DenseTensor cur_dinput = dx->Slice(start, end); // Current input grad sequence cur_dinput = cur_dinput.Resize({current_timesteps, input_dim}); diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 
f69889f7f8f25..6134c930ea01c 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -19,7 +19,6 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using framework::Tensor; namespace { @@ -327,7 +326,7 @@ class RowConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); - auto *Filter = context.Input("Filter"); + auto *Filter = context.Input("Filter"); auto *Out = context.Output("Out"); const T *in = X->data(); @@ -381,14 +380,16 @@ class RowConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); - auto *Filter = context.Input("Filter"); + auto *Filter = context.Input("Filter"); auto *dOut = context.Input(framework::GradVarName("Out")); const T *in = X->data(); const T *weights = Filter->data(); const T *dout = dOut->data(); - Tensor *dX = context.Output(framework::GradVarName("X")); - Tensor *dFilter = context.Output(framework::GradVarName("Filter")); + phi::DenseTensor *dX = + context.Output(framework::GradVarName("X")); + phi::DenseTensor *dFilter = + context.Output(framework::GradVarName("Filter")); int batch_size = 0; bool is_tensor = X->lod().empty(); if (is_tensor) { diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc index ecbe7fe663fc1..823eb03aff6ce 100644 --- a/paddle/fluid/operators/rrelu_op.cc +++ b/paddle/fluid/operators/rrelu_op.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class RReluOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 45fee045cbfd5..64afb3a2b91e9 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -55,7 +55,7 @@ class RunProgramOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return expected_kernel_type; } @@ -173,7 +173,7 @@ class RunProgramGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return expected_kernel_type; } diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index d0d8af95a3f72..8871627b85242 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -112,18 +112,21 @@ __global__ void gpu_compute_remove_accidental_hits(const int size, template class SampleLogitsCUDAKernel : public framework::OpKernel { public: - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { // get necessary inputs - const Tensor* logits = context.Input("Logits"); - const Tensor* labels = context.Input("Labels"); + const phi::DenseTensor* logits = context.Input("Logits"); + const phi::DenseTensor* labels = context.Input("Labels"); VLOG(3) << "Enter SampleLogitsCUDAKernel"; // get necessary 
outputs - Tensor* samples = context.Output("Samples"); - Tensor* probabilities = context.Output("Probabilities"); - Tensor* sampled_logits = context.Output("SampledLogits"); - Tensor* sampled_labels = context.Output("SampledLabels"); + phi::DenseTensor* samples = context.Output("Samples"); + phi::DenseTensor* probabilities = + context.Output("Probabilities"); + phi::DenseTensor* sampled_logits = + context.Output("SampledLogits"); + phi::DenseTensor* sampled_labels = + context.Output("SampledLabels"); // shapes const auto batch_size = logits->dims()[0]; @@ -158,10 +161,10 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { size, num_true, sampled_labels_data); if (use_customized_samples) { - const Tensor* customized_samples = - context.Input("CustomizedSamples"); - const Tensor* customized_probabilities = - context.Input("CustomizedProbabilities"); + const phi::DenseTensor* customized_samples = + context.Input("CustomizedSamples"); + const phi::DenseTensor* customized_probabilities = + context.Input("CustomizedProbabilities"); PADDLE_ENFORCE_EQ(customized_samples, samples, platform::errors::InvalidArgument( @@ -235,12 +238,15 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { template class SampleLogitsGradCUDAKernel : public framework::OpKernel { public: - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { - auto logits_grad = context.Output(framework::GradVarName("Logits")); - const Tensor* samples = context.Input("Samples"); - const Tensor* sampled_logits_grad = - context.Input(framework::GradVarName("SampledLogits")); + auto logits_grad = + context.Output(framework::GradVarName("Logits")); + const phi::DenseTensor* samples = + context.Input("Samples"); + const phi::DenseTensor* sampled_logits_grad = + context.Input( + framework::GradVarName("SampledLogits")); logits_grad->mutable_data(context.GetPlace()); auto& dev_ctx = context.cuda_device_context(); diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index d6affde0ce022..584d115d28ff3 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template @@ -48,9 +48,9 @@ struct TolerableValue { // UNDERSTAND: something like take_along_axis in numpy. template static void CPUTakeAlongD1(const platform::DeviceContext& ctx, - const framework::Tensor& array, - const framework::Tensor& index, - framework::Tensor* value) { + const phi::DenseTensor& array, + const phi::DenseTensor& index, + phi::DenseTensor* value) { PADDLE_ENFORCE_EQ( platform::is_cpu_place(ctx.GetPlace()), true, @@ -119,9 +119,9 @@ static void CPUTakeAlongD1(const platform::DeviceContext& ctx, // indices, scatter is done in += way. 
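A minimal sketch of the helper-signature change in sample_logits_op.h above; the `template <typename T>` heads are stripped in this copy and reconstructed here (declarations only):

    template <typename T>
    static void CPUTakeAlongD1(const platform::DeviceContext& ctx,
                               const phi::DenseTensor& array,   // was framework::Tensor
                               const phi::DenseTensor& index,
                               phi::DenseTensor* value);

    template <typename T>
    static void CPUPutAlongD1(const platform::DeviceContext& ctx,
                              phi::DenseTensor* array,
                              const phi::DenseTensor& index,
                              const phi::DenseTensor& value);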
template static void CPUPutAlongD1(const platform::DeviceContext& ctx, - framework::Tensor* array, - const framework::Tensor& index, - const framework::Tensor& value) { + phi::DenseTensor* array, + const phi::DenseTensor& index, + const phi::DenseTensor& value) { PADDLE_ENFORCE_EQ( platform::is_cpu_place(ctx.GetPlace()), true, @@ -188,8 +188,8 @@ static void CPUPutAlongD1(const platform::DeviceContext& ctx, // logits by a float max, here 1e20 template static void compute_remove_accidental_hits(const platform::DeviceContext& ctx, - framework::Tensor* sampled_logits, - const framework::Tensor& samples, + phi::DenseTensor* sampled_logits, + const phi::DenseTensor& samples, const int num_true) { const auto batch_size = sampled_logits->dims()[0]; const auto num_sampled_classes = sampled_logits->dims()[1]; @@ -212,7 +212,7 @@ static void compute_remove_accidental_hits(const platform::DeviceContext& ctx, template class SampleLogitsKernel : public framework::OpKernel { public: - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { PADDLE_ENFORCE_EQ( platform::is_cpu_place(context.GetPlace()), @@ -220,14 +220,17 @@ class SampleLogitsKernel : public framework::OpKernel { platform::errors::InvalidArgument("this kernel only runs on cpu.")); VLOG(3) << "Enter SampleLogitsKernel"; // get necessary inputs - const Tensor* logits = context.Input("Logits"); - const Tensor* labels = context.Input("Labels"); + const phi::DenseTensor* logits = context.Input("Logits"); + const phi::DenseTensor* labels = context.Input("Labels"); // get necessary outputs - Tensor* samples = context.Output("Samples"); - Tensor* probabilities = context.Output("Probabilities"); - Tensor* sampled_logits = context.Output("SampledLogits"); - Tensor* sampled_labels = context.Output("SampledLabels"); + phi::DenseTensor* samples = context.Output("Samples"); + phi::DenseTensor* probabilities = + context.Output("Probabilities"); + phi::DenseTensor* sampled_logits = + context.Output("SampledLogits"); + phi::DenseTensor* sampled_labels = + context.Output("SampledLabels"); // shapes const auto batch_size = logits->dims()[0]; @@ -257,10 +260,10 @@ class SampleLogitsKernel : public framework::OpKernel { } if (use_customized_samples) { - const Tensor* customized_samples = - context.Input("CustomizedSamples"); - const Tensor* customized_probabilities = - context.Input("CustomizedProbabilities"); + const phi::DenseTensor* customized_samples = + context.Input("CustomizedSamples"); + const phi::DenseTensor* customized_probabilities = + context.Input("CustomizedProbabilities"); PADDLE_ENFORCE_EQ(customized_samples, samples, platform::errors::InvalidArgument( @@ -305,12 +308,15 @@ class SampleLogitsKernel : public framework::OpKernel { template class SampleLogitsGradKernel : public framework::OpKernel { public: - using Tensor = framework::Tensor; + using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { - auto logits_grad = context.Output(framework::GradVarName("Logits")); - const Tensor* samples = context.Input("Samples"); - const Tensor* sampled_logits_grad = - context.Input(framework::GradVarName("SampledLogits")); + auto logits_grad = + context.Output(framework::GradVarName("Logits")); + const phi::DenseTensor* samples = + context.Input("Samples"); + const phi::DenseTensor* sampled_logits_grad = + context.Input( + framework::GradVarName("SampledLogits")); logits_grad->mutable_data(context.GetPlace()); auto& 
dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc index cd91e119faae0..6d2d3f4a60047 100644 --- a/paddle/fluid/operators/sampling_id_op.cc +++ b/paddle/fluid/operators/sampling_id_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class SamplingIdOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index 713d1b0475c3c..43c0bdcf4043e 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -27,13 +27,13 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SamplingIdKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); + const phi::DenseTensor* input = context.Input("X"); const int batch_size = static_cast(input->dims()[0]); const int width = static_cast(input->dims()[1]); @@ -75,7 +75,7 @@ class SamplingIdKernel : public framework::OpKernel { std::vector out_dim; out_dim.push_back(static_cast(batch_size)); - Tensor* output = context.Output("Out"); + phi::DenseTensor* output = context.Output("Out"); output->Resize(phi::make_ddim(out_dim)); output->mutable_data(context.GetPlace()); framework::TensorFromVector(ids, context.device_context(), output); diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 6b5c2367bb9ad..a25241d368aff 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class SaveCombineOp : public framework::OperatorWithKernel { public: @@ -37,7 +37,7 @@ class SaveCombineOp : public framework::OperatorWithKernel { // in operator impl, which is not elegant enough. framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, tensor.place()); diff --git a/paddle/fluid/operators/scale_op_mlu.cc b/paddle/fluid/operators/scale_op_mlu.cc index 363c3e98a6dfc..c9aefcfc5b1fc 100644 --- a/paddle/fluid/operators/scale_op_mlu.cc +++ b/paddle/fluid/operators/scale_op_mlu.cc @@ -28,10 +28,10 @@ class ScaleMLUKernel : public framework::OpKernel { // cnnl require input, scale, bias with same type. And all in device side. 
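The save_combine_op.cc hunk above (like run_program_op.cc and several ops later in this patch) updates the GetKernelTypeForVar override so its tensor parameter spells the phi type. A minimal sketch of the resulting override, mirroring the hunk:

    framework::OpKernelType GetKernelTypeForVar(
        const std::string& var_name,
        const phi::DenseTensor& tensor,            // was const framework::Tensor&
        const framework::OpKernelType& expected_kernel_type) const override {
      return framework::OpKernelType(expected_kernel_type.data_type_,
                                     tensor.place());
    }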
auto scale = static_cast(ctx.Attr("scale")); - framework::Tensor scale_tensor; + phi::DenseTensor scale_tensor; if (ctx.HasInput("ScaleTensor")) { - framework::Tensor float_scale_tensor = - *ctx.Input("ScaleTensor"); + phi::DenseTensor float_scale_tensor = + *ctx.Input("ScaleTensor"); if (framework::TransToProtoVarType(float_scale_tensor.dtype()) != framework::TransToProtoVarType(in->dtype())) { scale_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); @@ -60,7 +60,7 @@ class ScaleMLUKernel : public framework::OpKernel { } auto bias = static_cast(ctx.Attr("bias")); - framework::Tensor bias_tensor = + phi::DenseTensor bias_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc bias_desc(bias_tensor); MLUCnnl::Fill(ctx, @@ -98,7 +98,7 @@ class ScaleMLUKernel : public framework::OpKernel { output_desc.get(), GetBasePtr(out)); } else { - framework::Tensor new_bias_tensor = + phi::DenseTensor new_bias_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc new_bias_desc(new_bias_tensor); diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index 3663ded61daea..8d7e8d59004fa 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -19,9 +19,9 @@ namespace paddle { namespace operators { template -static inline T GetAttrFromTensor(const framework::Tensor* tensor) { +static inline T GetAttrFromTensor(const phi::DenseTensor* tensor) { const auto* tensor_data = tensor->data(); - framework::Tensor cpu_tensor; + phi::DenseTensor cpu_tensor; if (platform::is_gpu_place(tensor->place()) || platform::is_npu_place(tensor->place())) { paddle::framework::TensorCopySync( @@ -35,8 +35,8 @@ template class ScaleNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto scale = ctx.Attr("scale"); auto bias = ctx.Attr("bias"); auto bias_after_scale = ctx.Attr("bias_after_scale"); @@ -47,7 +47,7 @@ class ScaleNPUKernel : public framework::OpKernel { VLOG(4) << "scale:" << scale << ", bias:" << bias << " ,bias_after_scale:" << bias_after_scale; if (ctx.HasInput("ScaleTensor")) { - auto* scale_tensor = ctx.Input("ScaleTensor"); + auto* scale_tensor = ctx.Input("ScaleTensor"); scale = static_cast(GetAttrFromTensor(scale_tensor)); } if (isinf(scale)) { diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index e15a9b98e8c4a..4ed08a387f2a0 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -37,7 +37,7 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { "Ref and Updates must have same type")); return framework::OpKernelType( framework::TransToProtoVarType( - ctx.Input("X")->type()), + ctx.Input("X")->type()), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/scatter_op_mlu.cc b/paddle/fluid/operators/scatter_op_mlu.cc index 952da0edb8f34..a4cb5d7424936 100644 --- a/paddle/fluid/operators/scatter_op_mlu.cc +++ b/paddle/fluid/operators/scatter_op_mlu.cc @@ -19,11 +19,11 @@ template class ScatterMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* indices = ctx.Input("Ids"); - auto* updates = ctx.Input("Updates"); + auto* x = ctx.Input("X"); + auto* indices = ctx.Input("Ids"); + auto* updates = ctx.Input("Updates"); 
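A minimal sketch of the GetAttrFromTensor helper changed in scale_op_npu.cc above, which reads a scalar attribute stored in a possibly device-side tensor by first copying it to host; the template argument is reconstructed (assumption):

    template <typename T>
    static inline T GetAttrFromTensor(const phi::DenseTensor* tensor) {
      const auto* tensor_data = tensor->data<T>();
      phi::DenseTensor cpu_tensor;                 // was framework::Tensor
      if (platform::is_gpu_place(tensor->place()) ||
          platform::is_npu_place(tensor->place())) {
        paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
        tensor_data = cpu_tensor.data<T>();
      }
      return tensor_data[0];
    }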
bool overwrite = ctx.Attr("overwrite"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); MLUCnnlTensorDesc x_desc(*x); diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index 40e0b983e25d9..6bffd24734055 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -22,23 +22,23 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ScatterNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* index = ctx.Input("Ids"); - auto* updates = ctx.Input("Updates"); + auto* x = ctx.Input("X"); + auto* index = ctx.Input("Ids"); + auto* updates = ctx.Input("Updates"); bool overwrite = ctx.Attr("overwrite"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); - framework::Tensor tmp_tensor(index->type()); + phi::DenseTensor tmp_tensor(index->type()); const auto index_dims = index->dims(); if (index_dims.size() == 1) { tmp_tensor.ShareDataWith(*index); diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index 1249e3e807ec7..7f774089fd9ca 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -20,9 +20,9 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" TEST(scatter, ScatterUpdate) { - paddle::framework::Tensor src; - paddle::framework::Tensor index; - paddle::framework::Tensor output; + phi::DenseTensor src; + phi::DenseTensor index; + phi::DenseTensor output; auto* p_src = src.mutable_data(phi::make_ddim({1, 4}), paddle::platform::CPUPlace()); diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index 07cd48604b8aa..eceef2b4e5470 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using LoD = framework::LoD; diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 7de155b01c20e..88a1884ae53e4 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -17,7 +17,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class SeedOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index be406db50569d..87ba439d79201 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -22,7 +22,7 @@ template class GPUSeedKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *out = context.Output("Out"); + auto *out = context.Output("Out"); int seed = get_seed(context); auto force_cpu = context.Attr("force_cpu"); diff --git a/paddle/fluid/operators/seed_op.h b/paddle/fluid/operators/seed_op.h index 202f25e0b4cd1..a1c3484b7a728 100644 --- a/paddle/fluid/operators/seed_op.h +++ b/paddle/fluid/operators/seed_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; static int get_seed(const framework::ExecutionContext& context) { int user_seed = context.Attr("seed"); @@ -49,7 +49,7 @@ template class CPUSeedKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); + auto* out = context.Output("Out"); auto* out_data = out->mutable_data(context.GetPlace()); out_data[0] = get_seed(context); } diff --git a/paddle/fluid/operators/seed_op_npu.cc b/paddle/fluid/operators/seed_op_npu.cc index 39bd21e9ba6ab..cee905bdc1491 100644 --- a/paddle/fluid/operators/seed_op_npu.cc +++ b/paddle/fluid/operators/seed_op_npu.cc @@ -22,7 +22,7 @@ template class NPUSeedKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); int user_seed = ctx.Attr("seed"); std::random_device rnd; int seed; diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h index 4943e0e2ea09b..fa10965462191 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h @@ -26,7 +26,7 @@ namespace operators { namespace detail { template inline framework::LoD ConcatLoD(const Container &xs, - std::vector *xs_in_order) { + std::vector *xs_in_order) { std::vector result; result.resize(xs[0].get().lod()[0].size()); @@ -34,7 +34,7 @@ inline framework::LoD ConcatLoD(const Container &xs, size_t sum = 0; for (size_t j = 0; j < xs.size(); ++j) { auto &x_lod = xs[j].get().lod()[0]; - const framework::Tensor &tensor = xs[j].get(); + const phi::DenseTensor &tensor = xs[j].get(); if (x_lod[i - 1] < x_lod[i]) { xs_in_order->emplace_back(tensor.Slice(x_lod[i - 1], x_lod[i])); } @@ -98,7 +98,7 @@ class SeqConcatKernel : public framework::OpKernel { "received input lod size is %d", lod_size)); - std::vector x_in_order; + std::vector x_in_order; out.set_lod(detail::ConcatLoD(xs, &x_in_order)); 
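A minimal sketch of the slice collection updated in the sequence_concat_op.h ConcatLoD helper above; the vector element type is my reconstruction of the stripped template argument, and the index `i` comes from the helper's outer loop over LoD offsets (omitted here):

    std::vector<phi::DenseTensor> x_in_order;       // was std::vector<framework::Tensor>
    for (size_t j = 0; j < xs.size(); ++j) {
      auto& x_lod = xs[j].get().lod()[0];
      const phi::DenseTensor& tensor = xs[j].get();
      if (x_lod[i - 1] < x_lod[i]) {
        x_in_order.emplace_back(tensor.Slice(x_lod[i - 1], x_lod[i]));
      }
    }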
out.mutable_data(context.GetPlace()); math::ConcatFunctor functor; @@ -129,8 +129,8 @@ class SeqConcatGradKernel : public framework::OpKernel { } } - std::vector sliced_x; - std::vector> sliced_dx; + std::vector sliced_x; + std::vector> sliced_dx; for (size_t i = 1; i < xs[0]->lod()[0].size(); ++i) { for (size_t j = 0; j < xs.size(); ++j) { @@ -157,13 +157,13 @@ class SeqConcatGradKernel : public framework::OpKernel { } } - std::vector sliced_x_ptr; + std::vector sliced_x_ptr; sliced_x_ptr.reserve(sliced_x.size()); for (auto &x : sliced_x) { sliced_x_ptr.emplace_back(&x); } - std::vector sliced_dx_ptr; + std::vector sliced_dx_ptr; sliced_dx_ptr.reserve(sliced_dx.size()); for (auto &dx : sliced_dx) { if (dx) { @@ -174,7 +174,7 @@ class SeqConcatGradKernel : public framework::OpKernel { math::SplitFunctor functor; functor(context.template device_context(), GET_DATA_SAFELY( - context.Input(framework::GradVarName("Out")), + context.Input(framework::GradVarName("Out")), "Input", framework::GradVarName("Out"), "SeqConcatGrad"), diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h index ee7677aa2164d..80a9019906e8b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -31,7 +31,7 @@ class SequenceConvKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); auto* out = context.Output("Out"); - auto filter = *context.Input("Filter"); + auto filter = *context.Input("Filter"); out->mutable_data(context.GetPlace()); @@ -53,9 +53,9 @@ class SequenceConvKernel : public framework::OpKernel { "present. 
But received: lod level %u.", in->lod().size())); - const Tensor* padding_data = nullptr; + const phi::DenseTensor* padding_data = nullptr; if (padding_trainable) { - padding_data = context.Input("PaddingData"); + padding_data = context.Input("PaddingData"); } int up_pad = std::max(0, -context_start); @@ -94,11 +94,12 @@ class SequenceConvGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* in_g = context.Output(framework::GradVarName("X")); auto* out_g = context.Input(framework::GradVarName("Out")); - auto* filter_g = context.Output(framework::GradVarName("Filter")); + auto* filter_g = + context.Output(framework::GradVarName("Filter")); auto* padding_data_g = - context.Output(framework::GradVarName("PaddingData")); + context.Output(framework::GradVarName("PaddingData")); auto* in = context.Input("X"); - auto* filter = context.Input("Filter"); + auto* filter = context.Input("Filter"); int context_start = context.Attr("contextStart"); int context_length = context.Attr("contextLength"); @@ -180,9 +181,9 @@ class SequenceConvGradKernel : public framework::OpKernel { Tensor filter_grad = *filter_g; LoDTensor out_grad = *out_g; - const Tensor* padding_data = nullptr; + const phi::DenseTensor* padding_data = nullptr; if (padding_trainable) { - padding_data = context.Input("PaddingData"); + padding_data = context.Input("PaddingData"); } seq_project_functor(dev_ctx, diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index bfd5ce38645e8..f0083ec4042e6 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SequenceConvXPUKernel : public framework::OpKernel { @@ -27,7 +27,7 @@ class SequenceConvXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); auto* out = context.Output("Out"); - auto filter = *context.Input("Filter"); + auto filter = *context.Input("Filter"); out->mutable_data(context.GetPlace()); @@ -161,9 +161,10 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* in_g = context.Output(framework::GradVarName("X")); auto* out_g = context.Input(framework::GradVarName("Out")); - auto* filter_g = context.Output(framework::GradVarName("Filter")); + auto* filter_g = + context.Output(framework::GradVarName("Filter")); auto* in = context.Input("X"); - auto* filter = context.Input("Filter"); + auto* filter = context.Input("Filter"); int context_start = context.Attr("contextStart"); int context_length = context.Attr("contextLength"); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index 8ea756e455e23..6c14fa997fe5e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -47,7 +47,7 @@ class SequenceMaskOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "depth_tensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index a8105ef71a550..01cbed4509d85 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -29,7 +29,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template struct SequenceMaskForRangeFunctor { @@ -75,17 +75,17 @@ class SequenceMaskKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *y = ctx.Output("Y"); + auto *x = ctx.Input("X"); + auto *y = ctx.Output("Y"); int maxlen = ctx.Attr("maxlen"); if (ctx.HasInput("MaxLenTensor")) { - auto max_len_tensor = ctx.Input("MaxLenTensor"); + auto max_len_tensor = ctx.Input("MaxLenTensor"); PADDLE_ENFORCE_NOT_NULL(max_len_tensor, platform::errors::InvalidArgument( "Input(MaxLenTensor) should not be NULL." "But received Input(MaxLenTensor) is NULL")); if (platform::is_gpu_place(max_len_tensor->place())) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync( *max_len_tensor, platform::CPUPlace(), &temp); maxlen = *temp.data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc index b39e4f3bdd612..1290e79bc076d 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc @@ -18,24 +18,24 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SequenceMaskNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Output("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Output("Y"); int maxlen = ctx.Attr("maxlen"); if (ctx.HasInput("MaxLenTensor")) { - auto max_len_tensor = ctx.Input("MaxLenTensor"); + auto max_len_tensor = ctx.Input("MaxLenTensor"); PADDLE_ENFORCE_NOT_NULL(max_len_tensor, platform::errors::InvalidArgument( "Input(MaxLenTensor) should not be NULL." "But received Input(MaxLenTensor) is NULL")); - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync( *max_len_tensor, platform::CPUPlace(), &temp); maxlen = *temp.data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h index 90e84c5061e17..0811733a2d7ce 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template @@ -71,7 +71,7 @@ class SequencePoolKernel : public framework::OpKernel { dims[0] = lod[lod_level - 1].size() - 1; out->Resize({dims}); out->mutable_data(context.GetPlace()); - Tensor* index = nullptr; + phi::DenseTensor* index = nullptr; bool is_test = context.HasAttr("is_test") ? context.Attr("is_test") : false; @@ -81,7 +81,7 @@ class SequencePoolKernel : public framework::OpKernel { if (pooltype == "MAX" && (is_test == false || platform::is_cpu_place(context.GetPlace()) == false)) { - index = context.Output("MaxIndex"); + index = context.Output("MaxIndex"); index->Resize({dims}); index->mutable_data(context.GetPlace()); } @@ -103,9 +103,9 @@ class SequencePoolGradKernel : public framework::OpKernel { auto* out_g = context.Input(framework::GradVarName("Out")); auto* in_g = context.Output(framework::GradVarName("X")); std::string pooltype = context.Attr("pooltype"); - const Tensor* index = nullptr; + const phi::DenseTensor* index = nullptr; if (pooltype == "MAX") { - index = context.Input("MaxIndex"); + index = context.Input("MaxIndex"); } in_g->mutable_data(context.GetPlace()); math::SequencePoolGradFunctor pool; diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 1d53c39713acf..2050dfb27ddc8 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; class SequenceScatterOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h index 0be41c295e38d..68fe81c186da6 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h @@ -20,17 +20,17 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template class SequenceScatterOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); auto* ids = ctx.Input("Ids"); auto* updates = ctx.Input("Updates"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto& ids_lod = ids->lod(); PADDLE_ENFORCE_EQ(ids_lod.empty(), @@ -96,10 +96,10 @@ class SequenceScatterGradientOpKernel : public framework::OpKernel { platform::errors::Unimplemented("Device dose not match. The " "SequenceScatterGradientOpKernel can " "only run on CPU device.")); - auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dX = ctx.Output(framework::GradVarName("X")); auto* dUpdates = ctx.Output(framework::GradVarName("Updates")); auto* ids = ctx.Input("Ids"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dOut = ctx.Input(framework::GradVarName("Out")); auto& ids_lod = ids->lod(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index ad535341fd46f..e6310f7f9f54a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using LoD = framework::LoD; @@ -45,8 +45,8 @@ class SequenceSliceOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* in = ctx.Input("X"); - auto* offset = ctx.Input("Offset"); - auto* length = ctx.Input("Length"); + auto* offset = ctx.Input("Offset"); + auto* length = ctx.Input("Length"); auto* out = ctx.Output("Out"); auto lod = in->lod(); @@ -85,8 +85,8 @@ class SequenceSliceOpKernel : public framework::OpKernel { const int64_t* offset_data = offset->data(); const int64_t* length_data = length->data(); - framework::Tensor offset_cpu; - framework::Tensor length_cpu; + phi::DenseTensor offset_cpu; + phi::DenseTensor length_cpu; if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); @@ -156,8 +156,8 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* in = ctx.Input("X"); - auto* offset = ctx.Input("Offset"); - auto* length = ctx.Input("Length"); + auto* offset = ctx.Input("Offset"); + auto* length = ctx.Input("Length"); auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto* x_grad = @@ -165,8 +165,8 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { const int64_t* offset_data = offset->data(); const int64_t* length_data = length->data(); - framework::Tensor offset_cpu; - framework::Tensor length_cpu; + phi::DenseTensor offset_cpu; + phi::DenseTensor length_cpu; if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index b060aa9f08b15..73548eee454e4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ 
b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h index 0d3d3b695af4b..d5489d296ba47 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h index 1c1168e449eb7..149a9f0c2db18 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; static constexpr int TopKPosPaddingId = -1; @@ -75,7 +75,7 @@ class SequenceTopkAvgPoolingKernel : public framework::OpKernel { auto* row = context.Input("ROW"); auto* col = context.Input("COLUMN"); auto* out = context.Output("Out"); - auto* pos = context.Output("pos"); + auto* pos = context.Output("pos"); PADDLE_ENFORCE_EQ( in->lod().empty(), @@ -184,7 +184,7 @@ class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* d_out = context.Input(framework::GradVarName("Out")); auto* d_in = context.Output(framework::GradVarName("X")); - auto* pos_input = context.Input("pos"); + auto* pos_input = context.Input("pos"); auto* row_input = context.Input("ROW"); auto* col_input = context.Input("COLUMN"); auto* forward_input = context.Input("X"); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index 747549eed5182..43425c3e3a27b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -36,7 +36,7 @@ class SequenceUnpadOpKernel : public framework::OpKernel { auto* out_t = ctx.Output("Out"); auto& dev_ctx = ctx.template device_context(); - framework::Tensor seq_len_cpu = + phi::DenseTensor seq_len_cpu = ctx.AllocateTmpTensor(len_t->dims(), dev_ctx); if (platform::is_gpu_place(ctx.GetPlace()) || platform::is_xpu_place(ctx.GetPlace())) { diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 074642e1b0241..b1fe2dedcb293 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -36,7 +36,7 @@ class OpBase; namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class SetValue : public framework::OperatorWithKernel { public: @@ -210,7 +210,7 @@ class SetValueGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto in_tensor = ctx.Input(framework::GradVarName("Out")); + auto in_tensor = 
ctx.Input(framework::GradVarName("Out")); return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")), in_tensor->place()); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index d754f609393cf..7ef766020251b 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -31,7 +31,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DDim = framework::DDim; inline std::string GetValueName(framework::proto::VarType::Type data_type) { diff --git a/paddle/fluid/operators/set_value_op_mlu.cc b/paddle/fluid/operators/set_value_op_mlu.cc index 9a6277dfa2312..1b950a6da6084 100644 --- a/paddle/fluid/operators/set_value_op_mlu.cc +++ b/paddle/fluid/operators/set_value_op_mlu.cc @@ -26,14 +26,16 @@ template class SetValueMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); + auto* in = ctx.Input("Input"); + auto* value_tensor = ctx.Input("ValueTensor"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = ctx.MultiInput("StepsTensorList"); + auto starts_tensor_list = + ctx.MultiInput("StartsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto steps_tensor_list = + ctx.MultiInput("StepsTensorList"); auto axes = ctx.Attr>("axes"); auto starts = ctx.Attr>("starts"); @@ -135,7 +137,7 @@ class SetValueMLUKernel : public framework::OpKernel { int64_t stride_step = phi::product(in_dims); std::vector index_indices(stride_step); std::iota(index_indices.begin(), index_indices.end(), 0); - framework::Tensor index_temp; + phi::DenseTensor index_temp; in_temp.ShareDataWith(*in); val_temp.ShareDataWith(value_temp); paddle::framework::TensorFromVector( diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 419cbe6f9a77e..7526b13311b05 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -25,13 +25,15 @@ template class SetValueNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = ctx.MultiInput("StepsTensorList"); + auto* in = ctx.Input("Input"); + auto* value_tensor = ctx.Input("ValueTensor"); + auto* out = ctx.Output("Out"); + + auto starts_tensor_list = + ctx.MultiInput("StartsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto steps_tensor_list = + ctx.MultiInput("StepsTensorList"); auto axes = ctx.Attr>("axes"); auto starts = ctx.Attr>("starts"); diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 14f4b00b60d73..b191f7cfa0011 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -45,7 +45,7 @@ class ShapeOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - 
const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, diff --git a/paddle/fluid/operators/shape_op_mlu.cc b/paddle/fluid/operators/shape_op_mlu.cc index a890b22e7a933..5fde42dc7880f 100644 --- a/paddle/fluid/operators/shape_op_mlu.cc +++ b/paddle/fluid/operators/shape_op_mlu.cc @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = phi::SelectedRows; @@ -35,7 +35,7 @@ class ShapeMLUKernel : public framework::OpKernel { } else { in_dims = in_var->Get().dims(); } - auto* out_t = ctx.Output("Out"); + auto* out_t = ctx.Output("Out"); out_t->Resize({in_dims.size()}); out_t->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 95f2857bf3fcc..60a0162818c9d 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -18,14 +18,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ShapeNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out_t = ctx.Output("Out"); + auto* x = ctx.Input("Input"); + auto* out_t = ctx.Output("Out"); out_t->Resize({x->dims().size()}); out_t->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc index b80a50454d756..a6ff8022d0b84 100644 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ b/paddle/fluid/operators/shard_index_op_npu.cc @@ -19,7 +19,7 @@ namespace paddle { namespace operators { using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class ShardIndexNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/share_buffer_op.cc b/paddle/fluid/operators/share_buffer_op.cc index 89d6b231d7234..5ccd8d3189717 100644 --- a/paddle/fluid/operators/share_buffer_op.cc +++ b/paddle/fluid/operators/share_buffer_op.cc @@ -33,7 +33,7 @@ class ShareBufferOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return expected_kernel_type; } diff --git a/paddle/fluid/operators/share_buffer_op.h b/paddle/fluid/operators/share_buffer_op.h index 908047345fe0b..70cb72db36d79 100644 --- a/paddle/fluid/operators/share_buffer_op.h +++ b/paddle/fluid/operators/share_buffer_op.h @@ -24,8 +24,8 @@ template class ShareBufferOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto inputs = ctx.MultiInput("X"); - auto outputs = ctx.MultiOutput("Out"); + const auto inputs = ctx.MultiInput("X"); + auto outputs = ctx.MultiOutput("Out"); size_t n = inputs.size(); PADDLE_ENFORCE_EQ( n, diff --git a/paddle/fluid/operators/shuffle_batch_op.cc b/paddle/fluid/operators/shuffle_batch_op.cc index 16daffd1291d9..2fe8512b4b155 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cc +++ 
b/paddle/fluid/operators/shuffle_batch_op.cc @@ -63,7 +63,7 @@ class ShuffleBatchOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "Seed") { return expected_kernel_type; diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu index 6b70b8d37d79c..4ab4868bfb5b2 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cu +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -56,11 +56,11 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel { PADDLE_THROW(platform::errors::Unimplemented( "GPU shuffle_batch is not supported on Windows yet")); #else - auto *x = ctx.Input("X"); - auto *seed = ctx.Input("Seed"); - auto *out = ctx.Output("Out"); - auto *shuffleidx = ctx.Output("ShuffleIdx"); - auto *seed_out = ctx.Output("SeedOut"); + auto *x = ctx.Input("X"); + auto *seed = ctx.Input("Seed"); + auto *out = ctx.Output("Out"); + auto *shuffleidx = ctx.Output("ShuffleIdx"); + auto *seed_out = ctx.Output("SeedOut"); int64_t x_embed_size = x->dims()[x->dims().size() - 1]; int64_t elem_size = 1; @@ -76,7 +76,7 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel { // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would // not be CUDAPlace in practice. This case would only happen in Python // op_test framework. - framework::Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; framework::TensorCopySync(*seed, platform::CPUPlace(), &tmp_tensor); seed_int = *(tmp_tensor.data()); } else { @@ -126,9 +126,9 @@ class ShuffleBatchGradCUDAKernel : public framework::OpKernel { "GPU shuffle_batch_grad is not supported on Windows yet")); #else const auto *out_grad = - ctx.Input(framework::GradVarName("Out")); - const auto *shuffleidx = ctx.Input("ShuffleIdx"); - auto *x_grad = ctx.Output(framework::GradVarName("X")); + ctx.Input(framework::GradVarName("Out")); + const auto *shuffleidx = ctx.Input("ShuffleIdx"); + auto *x_grad = ctx.Output(framework::GradVarName("X")); const auto *out_grad_data = out_grad->data(); const auto *shuffleidx_data = shuffleidx->data(); diff --git a/paddle/fluid/operators/shuffle_batch_op.h b/paddle/fluid/operators/shuffle_batch_op.h index 009212afa81d0..cd24a8a2de159 100644 --- a/paddle/fluid/operators/shuffle_batch_op.h +++ b/paddle/fluid/operators/shuffle_batch_op.h @@ -32,7 +32,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; template diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index f51724d843107..26eee095377c0 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaximumNumBlocks = 4096; @@ -48,8 +48,8 @@ template class ShuffleChannelOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); int group = ctx.Attr("group"); auto input_dims = input->dims(); @@ -88,9 +88,9 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* output_grad = - ctx.Input(framework::GradVarName("Out")); + ctx.Input(framework::GradVarName("Out")); auto* input_grad = - ctx.Output(framework::GradVarName("X")); + ctx.Output(framework::GradVarName("X")); int group = ctx.Attr("group"); diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index 06abd0628ea39..51dfd894c0cfc 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -23,8 +23,8 @@ template class ShuffleChannelOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); int group = ctx.Attr("group"); const auto& input_dims = input->dims(); @@ -59,9 +59,9 @@ class ShuffleChannelGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* output_grad = - ctx.Input(framework::GradVarName("Out")); + ctx.Input(framework::GradVarName("Out")); auto* input_grad = - ctx.Output(framework::GradVarName("X")); + ctx.Output(framework::GradVarName("X")); int group = ctx.Attr("group"); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index c9705fa9a9924..a05fae4b45a38 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -24,7 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc index b679432e51a5b..d77724281327c 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { @@ -43,10 +43,10 @@ class SigmoidCrossEntropyWithLogitsMLUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { CheckAttrs(ctx); - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -77,11 +77,11 @@ class SigmoidCrossEntropyWithLogitsMLUGradKernel void Compute(const framework::ExecutionContext& ctx) const override { CheckAttrs(ctx); - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); + auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index 59eb23aceda02..ea3f119a05a91 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { @@ -45,10 +45,10 @@ class SigmoidCrossEntropyWithLogitsNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { CheckAttrs(ctx); - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -71,11 +71,11 @@ class SigmoidCrossEntropyWithLogitsNPUGradKernel void Compute(const framework::ExecutionContext& ctx) const override { CheckAttrs(ctx); - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); + auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h index b6de90e8e0367..8c055c2323c84 100644 --- a/paddle/fluid/operators/similarity_focus_op.h +++ b/paddle/fluid/operators/similarity_focus_op.h @@ -24,14 +24,14 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SimilarityFocusKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - Tensor* out = context.Output("Out"); - const Tensor* x = context.Input("X"); + phi::DenseTensor* out = context.Output("Out"); + const phi::DenseTensor* x = context.Input("X"); T* out_data = out->mutable_data(context.GetPlace()); const T* x_data = x->data(); diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index 79b4be9de2dbb..6d04f7a1c7cab 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -33,7 +33,7 @@ class SizeOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return expected_kernel_type; } diff --git a/paddle/fluid/operators/size_op_mlu.cc b/paddle/fluid/operators/size_op_mlu.cc index 32338b05d6e30..5553f538a575d 100644 --- a/paddle/fluid/operators/size_op_mlu.cc +++ b/paddle/fluid/operators/size_op_mlu.cc @@ -22,8 +22,8 @@ template class SizeMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("Input"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); int64_t size = x->numel(); diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc index ed95a85be9815..92aeba9280568 100644 --- a/paddle/fluid/operators/size_op_npu.cc +++ b/paddle/fluid/operators/size_op_npu.cc @@ -22,8 +22,8 @@ template class SizeNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("Input"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); Tensor cpu_tensor; diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 9d9e5816db702..44a9bd24032ab 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -24,7 +24,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class SliceOp : public framework::OperatorWithKernel { public: @@ -162,9 +162,9 @@ class SliceOp : public framework::OperatorWithKernel { // 16(depending on which blocking format is used) submemory cannot be // created, so in that scenario a fallback is needed auto tmp_md = dnnl::memory::desc( - phi::vectorize(ctx.Input("Input")->dims()), + phi::vectorize(ctx.Input("Input")->dims()), dnnl::memory::data_type::f32, - ctx.Input("Input")->format()); + ctx.Input("Input")->format()); if (tmp_md.data.format_desc.blocking.inner_nblks == 0) return framework::OpKernelType(input_data_type, ctx.GetPlace(), @@ -338,9 +338,10 @@ class SliceOpGrad : public framework::OperatorWithKernel { // created, so in that scenario a fallback is needed auto tmp_md = dnnl::memory::desc( phi::vectorize( - ctx.Input(framework::GradVarName("Out"))->dims()), + ctx.Input(framework::GradVarName("Out")) + ->dims()), dnnl::memory::data_type::f32, - ctx.Input(framework::GradVarName("Out"))->format()); + ctx.Input(framework::GradVarName("Out"))->format()); if (tmp_md.data.format_desc.blocking.inner_nblks == 0) return framework::OpKernelType(input_data_type, ctx.GetPlace(), diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index beaec7bc5b91b..5efb0c3819450 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -24,7 +24,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using Variable = framework::Variable; using LoDTensorArray = framework::LoDTensorArray; using DDim = framework::DDim; @@ -73,7 +73,7 @@ inline void DealTensorArray(const framework::ExecutionContext& ctx, } } } else { - auto out = ctx.Output("Out"); + auto out = ctx.Output("Out"); auto in_tensor = in_array->at(start); paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out); } @@ -99,16 +99,19 @@ class SliceKernel : public framework::OpKernel { auto infer_flags = ctx.Attr>("infer_flags"); // Step 1: Get the accurate attribute value of starts and ends - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto starts_tensor_list = + ctx.MultiInput("StartsTensorList"); if (ctx.HasInput("StartsTensor")) { - starts = GetDataFromTensor(ctx.Input("StartsTensor")); + starts = GetDataFromTensor( + ctx.Input("StartsTensor")); } else if (starts_tensor_list.size() > 0) { starts = GetDataFromTensorList(starts_tensor_list); } - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); if (ctx.HasInput("EndsTensor")) { - ends = GetDataFromTensor(ctx.Input("EndsTensor")); + ends = + GetDataFromTensor(ctx.Input("EndsTensor")); } else if (ends_tensor_list.size() > 0) { ends = GetDataFromTensorList(ends_tensor_list); } @@ -143,16 +146,19 @@ class SliceGradKernel : public framework::OpKernel { std::vector ends(ends_int.begin(), ends_int.end()); // Get the accurate attribute value of starts and ends - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto starts_tensor_list = + ctx.MultiInput("StartsTensorList"); if (ctx.HasInput("StartsTensor")) { - starts = GetDataFromTensor(ctx.Input("StartsTensor")); + starts = GetDataFromTensor( + ctx.Input("StartsTensor")); } else if (starts_tensor_list.size() > 0) { starts = GetDataFromTensorList(starts_tensor_list); } - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + 
auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); if (ctx.HasInput("EndsTensor")) { - ends = GetDataFromTensor(ctx.Input("EndsTensor")); + ends = + GetDataFromTensor(ctx.Input("EndsTensor")); } else if (ends_tensor_list.size() > 0) { ends = GetDataFromTensorList(ends_tensor_list); } @@ -196,7 +202,8 @@ class SliceGradKernel : public framework::OpKernel { d_out_arr->at(i), ctx.GetPlace(), &(d_in_arr->at(start + i))); } } else { - auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_out = + ctx.Input(framework::GradVarName("Out")); paddle::framework::TensorCopy( *d_out, ctx.GetPlace(), &(d_in_arr->at(start))); } diff --git a/paddle/fluid/operators/slice_op_mlu.cc b/paddle/fluid/operators/slice_op_mlu.cc index 7d12916f33439..60c86b1fcf5f6 100644 --- a/paddle/fluid/operators/slice_op_mlu.cc +++ b/paddle/fluid/operators/slice_op_mlu.cc @@ -19,14 +19,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SliceMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* out = ctx.Output("Out"); + auto* input = ctx.Input("Input"); + auto* out = ctx.Output("Out"); auto axes = ctx.Attr>("axes"); auto starts = ctx.Attr>("starts"); @@ -36,16 +36,18 @@ class SliceMLUKernel : public framework::OpKernel { auto infer_flags = ctx.Attr>("infer_flags"); // Get the accurate attribute value of starts and ends - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto starts_tensor_list = + ctx.MultiInput("StartsTensorList"); if (ctx.HasInput("StartsTensor")) { - starts = GetDataFromTensor(ctx.Input("StartsTensor")); + starts = + GetDataFromTensor(ctx.Input("StartsTensor")); } else if (starts_tensor_list.size() > 0) { starts = GetDataFromTensorList(starts_tensor_list); } - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); if (ctx.HasInput("EndsTensor")) { - ends = GetDataFromTensor(ctx.Input("EndsTensor")); + ends = GetDataFromTensor(ctx.Input("EndsTensor")); } else if (ends_tensor_list.size() > 0) { ends = GetDataFromTensorList(ends_tensor_list); } @@ -127,25 +129,28 @@ template class SliceGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dinput = ctx.Output(framework::GradVarName("Input")); + auto* input = ctx.Input("Input"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dinput = + ctx.Output(framework::GradVarName("Input")); auto axes = ctx.Attr>("axes"); auto starts = ctx.Attr>("starts"); auto ends = ctx.Attr>("ends"); // Get the accurate attribute value of starts and ends - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto starts_tensor_list = + ctx.MultiInput("StartsTensorList"); if (ctx.HasInput("StartsTensor")) { - starts = GetDataFromTensor(ctx.Input("StartsTensor")); + starts = + GetDataFromTensor(ctx.Input("StartsTensor")); } else if (starts_tensor_list.size() > 0) { starts = GetDataFromTensorList(starts_tensor_list); } - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); if (ctx.HasInput("EndsTensor")) { - ends = GetDataFromTensor(ctx.Input("EndsTensor")); + ends = GetDataFromTensor(ctx.Input("EndsTensor")); } else if 
(ends_tensor_list.size() > 0) { ends = GetDataFromTensorList(ends_tensor_list); } diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 9d248bfd7f39c..5ed606c7e0057 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; void UpdateAttr(const framework::DDim& in_dims, @@ -58,8 +58,8 @@ template class SliceNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* out = ctx.Output("Out"); + auto* input = ctx.Input("Input"); + auto* out = ctx.Output("Out"); auto axes_int = ctx.Attr>("axes"); auto starts_int = ctx.Attr>("starts"); @@ -74,16 +74,18 @@ class SliceNPUKernel : public framework::OpKernel { const auto& in_dims = input->dims(); // Get the accurate attribute value of starts and ends - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto starts_tensor_list = + ctx.MultiInput("StartsTensorList"); if (ctx.HasInput("StartsTensor")) { - starts = GetDataFromTensor(ctx.Input("StartsTensor")); + starts = + GetDataFromTensor(ctx.Input("StartsTensor")); } else if (starts_tensor_list.size() > 0) { starts = GetDataFromTensorList(starts_tensor_list); } - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); if (ctx.HasInput("EndsTensor")) { - ends = GetDataFromTensor(ctx.Input("EndsTensor")); + ends = GetDataFromTensor(ctx.Input("EndsTensor")); } else if (ends_tensor_list.size() > 0) { ends = GetDataFromTensorList(ends_tensor_list); } @@ -154,9 +156,10 @@ template class SliceGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dinput = ctx.Output(framework::GradVarName("Input")); + auto* input = ctx.Input("Input"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dinput = + ctx.Output(framework::GradVarName("Input")); auto axes_int = ctx.Attr>("axes"); auto starts_int = ctx.Attr>("starts"); @@ -166,16 +169,18 @@ class SliceGradNPUKernel : public framework::OpKernel { std::vector ends(ends_int.begin(), ends_int.end()); // Get the accurate attribute value of starts and ends - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto starts_tensor_list = + ctx.MultiInput("StartsTensorList"); if (ctx.HasInput("StartsTensor")) { - starts = GetDataFromTensor(ctx.Input("StartsTensor")); + starts = + GetDataFromTensor(ctx.Input("StartsTensor")); } else if (starts_tensor_list.size() > 0) { starts = GetDataFromTensorList(starts_tensor_list); } - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); if (ctx.HasInput("EndsTensor")) { - ends = GetDataFromTensor(ctx.Input("EndsTensor")); + ends = GetDataFromTensor(ctx.Input("EndsTensor")); } else if (ends_tensor_list.size() > 0) { ends = GetDataFromTensorList(ends_tensor_list); } diff --git a/paddle/fluid/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h index fd9d62a78f540..3cc565ef91203 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.h +++ b/paddle/fluid/operators/smooth_l1_loss_op.h @@ -20,7 
+20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template @@ -50,12 +50,12 @@ template class SmoothL1LossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* in2 = context.Input("InsideWeight"); - auto* in3 = context.Input("OutsideWeight"); - auto* out0 = context.Output("Diff"); - auto* out1 = context.Output("Out"); + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* in2 = context.Input("InsideWeight"); + auto* in3 = context.Input("OutsideWeight"); + auto* out0 = context.Output("Diff"); + auto* out1 = context.Output("Out"); out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); @@ -121,10 +121,10 @@ template class SmoothL1LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("InsideWeight"); - auto* in1 = context.Input("OutsideWeight"); - auto* in2 = context.Input("Diff"); - auto* og = context.Input(framework::GradVarName("Out")); + auto* in0 = context.Input("InsideWeight"); + auto* in1 = context.Input("OutsideWeight"); + auto* in2 = context.Input("Diff"); + auto* og = context.Input(framework::GradVarName("Out")); auto sigma = static_cast(context.Attr("sigma")); T sigma2 = sigma * sigma; bool has_weight = (in0 != nullptr) && (in1 != nullptr); @@ -165,8 +165,8 @@ class SmoothL1LossGradKernel : public framework::OpKernel { Eigen::array({{1, static_cast(cols)}})) * weights * diff_mat_view; - auto* out0 = context.Output(framework::GradVarName("X")); - auto* out1 = context.Output(framework::GradVarName("Y")); + auto* out0 = context.Output(framework::GradVarName("X")); + auto* out1 = context.Output(framework::GradVarName("Y")); if (out0) { out0->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc index 5c1e0cfbb5e90..1a4fb14bbb0b6 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc @@ -23,12 +23,12 @@ template class SmoothL1LossNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); - auto* inside_weight = context.Input("InsideWeight"); - auto* outside_weight = context.Input("OutsideWeight"); - auto* out_diff = context.Output("Diff"); - auto* out_loss = context.Output("Out"); + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* inside_weight = context.Input("InsideWeight"); + auto* outside_weight = context.Input("OutsideWeight"); + auto* out_diff = context.Output("Diff"); + auto* out_loss = context.Output("Out"); out_diff->mutable_data(context.GetPlace()); out_loss->mutable_data(context.GetPlace()); @@ -117,12 +117,14 @@ template class SmoothL1LossGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* inside_weight = context.Input("InsideWeight"); - auto* outside_weight = context.Input("OutsideWeight"); - auto* diff = context.Input("Diff"); - auto* og = context.Input(framework::GradVarName("Out")); - auto* outx_grad = context.Output(framework::GradVarName("X")); - auto* outy_grad = 
context.Output(framework::GradVarName("Y")); + auto* inside_weight = context.Input("InsideWeight"); + auto* outside_weight = context.Input("OutsideWeight"); + auto* diff = context.Input("Diff"); + auto* og = context.Input(framework::GradVarName("Out")); + auto* outx_grad = + context.Output(framework::GradVarName("X")); + auto* outy_grad = + context.Output(framework::GradVarName("Y")); auto sigma = context.Attr("sigma"); T sigma2 = 1.0 / (sigma * sigma); bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); diff --git a/paddle/fluid/operators/softmax_op_mlu.cc b/paddle/fluid/operators/softmax_op_mlu.cc index 50ef6c6599294..0a39b1335d6e4 100644 --- a/paddle/fluid/operators/softmax_op_mlu.cc +++ b/paddle/fluid/operators/softmax_op_mlu.cc @@ -67,7 +67,7 @@ class SoftmaxGradMLUKernel : public framework::OpKernel { auto* out = ctx.Input("Out"); auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dX = ctx.Output(framework::GradVarName("X")); dX->mutable_data(ctx.GetPlace()); const int rank = out->dims().size(); diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc index 6e4ccadaec04f..fa40fa3be7a66 100644 --- a/paddle/fluid/operators/softmax_op_npu.cc +++ b/paddle/fluid/operators/softmax_op_npu.cc @@ -48,7 +48,7 @@ class SoftmaxGradNPUKernel : public framework::OpKernel { auto* out = ctx.Input("Out"); auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dX = ctx.Output(framework::GradVarName("X")); auto dims = dX->dims(); const int rank = dims.size(); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index f2fb529656744..91333b3393000 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -19,17 +19,17 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* logits = ctx.Input("Logits"); - auto* labels = ctx.Input("Label"); - auto* softmax = ctx.Output("Softmax"); - auto* loss = ctx.Output("Loss"); - auto* backprop = ctx.Output("Backprop"); + auto* logits = ctx.Input("Logits"); + auto* labels = ctx.Input("Label"); + auto* softmax = ctx.Output("Softmax"); + auto* loss = ctx.Output("Loss"); + auto* backprop = ctx.Output("Backprop"); auto soft_label = ctx.Attr("soft_label"); PADDLE_ENFORCE_EQ(ctx.Attr("use_softmax"), @@ -103,7 +103,7 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "If soft_label=False, axis must be -1 or" " can be regard as last dimention in mlu kernel.")); - framework::Tensor labels_int32(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor labels_int32(framework::TransToPhiDataType(VT::INT32)); labels_int32.Resize(labels->dims()); labels_int32.mutable_data(ctx.GetPlace()); @@ -142,9 +142,11 @@ template class SoftmaxWithCrossEntropyGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* backprop = ctx.Input("Backprop"); - auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); - auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); + auto* backprop = ctx.Input("Backprop"); + auto* loss_grad = + ctx.Input(framework::GradVarName("Loss")); + auto* logits_grad = + ctx.Output(framework::GradVarName("Logits")); PADDLE_ENFORCE_NOT_NULL(backprop, platform::errors::PreconditionNotMet( "backprop should not be null in MLU kernel of " diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index db1581a26febb..ddcb07b4d77e4 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -24,17 +24,17 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* logits = ctx.Input("Logits"); - auto* labels = ctx.Input("Label"); - auto* softmax = ctx.Output("Softmax"); - auto* loss = ctx.Output("Loss"); - auto* backprop = ctx.Output("Backprop"); + auto* logits = ctx.Input("Logits"); + auto* labels = ctx.Input("Label"); + auto* softmax = ctx.Output("Softmax"); + auto* loss = ctx.Output("Loss"); + auto* backprop = ctx.Output("Backprop"); auto soft_label = ctx.Attr("soft_label"); PADDLE_ENFORCE_EQ(soft_label, false, @@ -93,9 +93,11 @@ template class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* backprop = ctx.Input("Backprop"); - auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); - auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); + auto* backprop = ctx.Input("Backprop"); + auto* loss_grad = + ctx.Input(framework::GradVarName("Loss")); + auto* logits_grad = + ctx.Output(framework::GradVarName("Logits")); PADDLE_ENFORCE_NOT_NULL(backprop, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index dce7539fe72b8..6cc8d0f79be4e 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class SpaceToDepthOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu index fd2ccfdea33cf..b03a0b6c84e71 100644 --- a/paddle/fluid/operators/sparse_attention_op.cu +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -203,22 +203,22 @@ __global__ void BlockSparseSoftmaxBackward(T* dst, } } -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; /* input: sparse C in CSR format (num_rows,num_rows) output: sparse C after softmax operation */ template void SparseSoftmaxForward(const phi::GPUContext& ctx, - const Tensor* offset, - const Tensor* columns, - Tensor* input, - Tensor* output, + const phi::DenseTensor* offset, + const phi::DenseTensor* columns, + phi::DenseTensor* input, + phi::DenseTensor* output, const int blocksize, const int num_rows, const int num_cols, - const Tensor* key_padding_mask, - const Tensor* attn_mask) { + const phi::DenseTensor* key_padding_mask, + const phi::DenseTensor* attn_mask) { const int* offset_data = offset->data(); const int* columns_data = columns->data(); T* input_data = input->data(); @@ -323,11 +323,11 @@ void SparseSoftmaxForward(const phi::GPUContext& ctx, template void SparseSoftmaxBackward(const phi::GPUContext& ctx, - const Tensor* offset, - const Tensor* columns, - Tensor* dx, - const Tensor* dout, - const Tensor* out, + const phi::DenseTensor* offset, + const phi::DenseTensor* columns, + phi::DenseTensor* dx, + const phi::DenseTensor* dout, + const phi::DenseTensor* out, const int blocksize, const int num_rows, const int num_cols) { @@ -454,11 +454,11 @@ output: sparse C in CSR format (num_rows,num_rows) */ template void DotSdd(const phi::GPUContext& ctx, - const Tensor* a, - const Tensor* b, - const Tensor* 
c_offset, - const Tensor* c_columns, - Tensor* c_value, + const phi::DenseTensor* a, + const phi::DenseTensor* b, + const phi::DenseTensor* c_offset, + const phi::DenseTensor* c_columns, + phi::DenseTensor* c_value, const int num_rows, const int num_cols, const bool a_transpose, @@ -550,11 +550,11 @@ output: dense C (num_rows,num_cols) */ template void DotDsd(const phi::GPUContext& ctx, - const Tensor* a_offset, - const Tensor* a_columns, - const Tensor* a_value, - const Tensor* b, - Tensor* c, + const phi::DenseTensor* a_offset, + const phi::DenseTensor* a_columns, + const phi::DenseTensor* a_value, + const phi::DenseTensor* b, + phi::DenseTensor* c, const int num_rows, const int num_cols, const bool a_transpose, @@ -641,7 +641,7 @@ void DotDsd(const phi::GPUContext& ctx, platform::dynload::cusparseDestroy(handle); } -std::vector GetSplitTensor(Tensor* input) { +std::vector GetSplitTensor(phi::DenseTensor* input) { auto dims = input->dims(); int batch_size = dims[0]; int num_heads = dims[1]; @@ -658,23 +658,24 @@ template class SparseAttentionCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto query = *ctx.Input("Q"); - auto key = *ctx.Input("K"); - auto value = *ctx.Input("V"); - auto offset = *ctx.Input("Offset"); - auto columns = *ctx.Input("Columns"); - auto output_ptr = ctx.Output("Out"); + auto query = *ctx.Input("Q"); + auto key = *ctx.Input("K"); + auto value = *ctx.Input("V"); + auto offset = *ctx.Input("Offset"); + auto columns = *ctx.Input("Columns"); + auto output_ptr = ctx.Output("Out"); output_ptr->mutable_data(ctx.GetPlace()); - auto sparse_dot_sdd_ptr = ctx.Output("SparseDotSdd"); + auto sparse_dot_sdd_ptr = ctx.Output("SparseDotSdd"); sparse_dot_sdd_ptr->mutable_data(ctx.GetPlace()); - auto softmax_ptr = ctx.Output("Softmax"); + auto softmax_ptr = ctx.Output("Softmax"); softmax_ptr->mutable_data(ctx.GetPlace()); // add Mask auto* key_padding_mask = ctx.HasInput("KeyPaddingMask") - ? ctx.Input("KeyPaddingMask") + ? ctx.Input("KeyPaddingMask") : nullptr; - auto* attn_mask = - ctx.HasInput("AttnMask") ? ctx.Input("AttnMask") : nullptr; + auto* attn_mask = ctx.HasInput("AttnMask") + ? 
ctx.Input("AttnMask") + : nullptr; auto output = *output_ptr; auto result_sdd = *sparse_dot_sdd_ptr; @@ -775,17 +776,19 @@ template class SparseAttentionGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto query = *ctx.Input("Q"); - auto key = *ctx.Input("K"); - auto value = *ctx.Input("V"); - auto offset = *ctx.Input("Offset"); - auto columns = *ctx.Input("Columns"); - auto sparse_dot_sdd = *ctx.Input("SparseDotSdd"); - auto softmax = *ctx.Input("Softmax"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - auto* dquery_ptr = ctx.Output(framework::GradVarName("Q")); - auto* dkey_ptr = ctx.Output(framework::GradVarName("K")); - auto* dvalue_ptr = ctx.Output(framework::GradVarName("V")); + auto query = *ctx.Input("Q"); + auto key = *ctx.Input("K"); + auto value = *ctx.Input("V"); + auto offset = *ctx.Input("Offset"); + auto columns = *ctx.Input("Columns"); + auto sparse_dot_sdd = *ctx.Input("SparseDotSdd"); + auto softmax = *ctx.Input("Softmax"); + auto dout = *ctx.Input(framework::GradVarName("Out")); + auto* dquery_ptr = + ctx.Output(framework::GradVarName("Q")); + auto* dkey_ptr = ctx.Output(framework::GradVarName("K")); + auto* dvalue_ptr = + ctx.Output(framework::GradVarName("V")); dquery_ptr->mutable_data(ctx.GetPlace()); dkey_ptr->mutable_data(ctx.GetPlace()); dvalue_ptr->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc index 19a846afd4376..372e31aa9af63 100644 --- a/paddle/fluid/operators/spectral_norm_op.cc +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -20,8 +20,6 @@ namespace paddle { namespace operators { -using framework::Tensor; - class SpectralNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index e5c59575a749b..8ca08e75c40ec 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace operators { using framework::LoDTensor; -using framework::Tensor; + using framework::Variable; class SplitOp : public framework::OperatorWithKernel { @@ -120,7 +120,7 @@ class SplitOp : public framework::OperatorWithKernel { // reorders, because if blocked dimension is not divisible by 8 or // 16(depending on which blocking format is used) submemory cannot be // created, so in that scenario a fallback is needed - const auto x_md = ctx.Input("X")->mem_desc(); + const auto x_md = ctx.Input("X")->mem_desc(); if (x_md.data.format_desc.blocking.inner_nblks == 0) return framework::OpKernelType(input_data_type, ctx.GetPlace(), @@ -133,7 +133,7 @@ class SplitOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "AxisTensor" || var_name == "SectionsTensorList") { return expected_kernel_type; diff --git a/paddle/fluid/operators/split_op_mlu.cc b/paddle/fluid/operators/split_op_mlu.cc index 635f3925a8f02..cda18720e7aba 100644 --- a/paddle/fluid/operators/split_op_mlu.cc +++ b/paddle/fluid/operators/split_op_mlu.cc @@ -18,15 +18,15 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SplitMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // init parameter - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); + auto* in = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); int num = ctx.Attr("num"); std::vector sections = ctx.Attr>("sections"); int axis = ctx.Attr("axis"); @@ -36,12 +36,12 @@ class SplitMLUKernel : public framework::OpKernel { bool need_resize_outs_dims = false; if (ctx.HasInput("AxisTensor")) { - auto* axis_tensor = ctx.Input("AxisTensor"); + auto* axis_tensor = ctx.Input("AxisTensor"); axis = GetDataFromTensor(axis_tensor)[0]; need_resize_outs_dims = true; } auto sections_tensor_list = - ctx.MultiInput("SectionsTensorList"); + ctx.MultiInput("SectionsTensorList"); if (sections_tensor_list.size() > 0) { sections = GetDataFromTensorList(sections_tensor_list); need_resize_outs_dims = true; diff --git a/paddle/fluid/operators/split_op_npu.cc b/paddle/fluid/operators/split_op_npu.cc index 427070e8f2120..2fa8fa2a805eb 100644 --- a/paddle/fluid/operators/split_op_npu.cc +++ b/paddle/fluid/operators/split_op_npu.cc @@ -21,14 +21,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SplitNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); + auto* in = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); int num = ctx.Attr("num"); std::vector sections = ctx.Attr>("sections"); int axis = ctx.Attr("axis"); diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index fd369aee0eaa6..260d368dd0ba1 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -28,8 +28,8 @@ template class SppKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = context.Input("X"); - auto* out = context.Output("Out"); + const phi::DenseTensor* in_x = context.Input("X"); + auto* out = context.Output("Out"); int pyramid_height = context.template Attr("pyramid_height"); std::string pooling_type = context.template Attr("pooling_type"); @@ -48,7 +48,7 @@ class SppKernel : public framework::OpKernel { std::vector strides({kernel_size_h, kernel_size_w}); std::vector paddings({padding_h, padding_w}); // pooling output shape - framework::Tensor out_level; + phi::DenseTensor out_level; std::vector output_shape_vec( {in_x->dims()[0], in_x->dims()[1], bins, bins}); framework::DDim output_shape(phi::make_ddim(output_shape_vec)); @@ -110,12 +110,12 @@ template class SppGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* out = context.Input("Out"); - const framework::Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - framework::Tensor* in_x_grad = - context.Output(framework::GradVarName("X")); + const phi::DenseTensor* in_x = context.Input("X"); + const phi::DenseTensor* out = context.Input("Out"); + const phi::DenseTensor* out_grad = + context.Input(framework::GradVarName("Out")); + phi::DenseTensor* in_x_grad = + 
context.Output(framework::GradVarName("X")); int pyramid_height = context.template Attr("pyramid_height"); std::string pooling_type = context.template Attr("pooling_type"); @@ -140,8 +140,8 @@ class SppGradKernel : public framework::OpKernel { std::vector strides({kernel_size_h, kernel_size_w}); std::vector paddings({padding_h, padding_w}); // split out and outgrad ... to flatten - framework::Tensor out_level; - framework::Tensor outgrad_level; + phi::DenseTensor out_level; + phi::DenseTensor outgrad_level; int out_flatten_w = in_x->dims()[1] * bins * bins; std::vector out_flatten_shape_vec( {in_x->dims()[0], out_flatten_w}); diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h index 1c1a34b14ba00..1698c65fc47ac 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.h +++ b/paddle/fluid/operators/squared_l2_distance_op.h @@ -19,16 +19,16 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SquaredL2DistanceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* out0 = context.Output("sub_result"); - auto* out1 = context.Output("Out"); + auto* in0 = context.Input("X"); + auto* in1 = context.Input("Y"); + auto* out0 = context.Output("sub_result"); + auto* out1 = context.Output("Out"); auto in0_dims = in0->dims(); auto in1_dims = in1->dims(); @@ -66,10 +66,10 @@ template class SquaredL2DistanceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("sub_result"); - auto* in1 = context.Input(framework::GradVarName("Out")); - auto* x_g = context.Output(framework::GradVarName("X")); - auto* y_g = context.Output(framework::GradVarName("Y")); + auto* in0 = context.Input("sub_result"); + auto* in1 = context.Input(framework::GradVarName("Out")); + auto* x_g = context.Output(framework::GradVarName("X")); + auto* y_g = context.Output(framework::GradVarName("Y")); PADDLE_ENFORCE_NOT_NULL( x_g, diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc index 4653cc0cc2860..2e97f5b9b0dc2 100644 --- a/paddle/fluid/operators/squared_l2_norm_op.cc +++ b/paddle/fluid/operators/squared_l2_norm_op.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class SquaredL2NormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/squared_l2_norm_op_mlu.cc b/paddle/fluid/operators/squared_l2_norm_op_mlu.cc index 741d23540b6e4..fcd83b40875ec 100644 --- a/paddle/fluid/operators/squared_l2_norm_op_mlu.cc +++ b/paddle/fluid/operators/squared_l2_norm_op_mlu.cc @@ -19,15 +19,15 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SquaredL2NormMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto &dev_ctx = context.template device_context(); - auto *x = context.Input("X"); - auto *out = context.Output("Out"); + auto *x = context.Input("X"); + auto *out = context.Output("Out"); auto place = context.GetPlace(); @@ -40,9 +40,9 @@ class SquaredL2NormMLUKernel : public framework::OpKernel { MLUCnnl::L2Loss(context, input_desc.get(), GetBasePtr(x), GetBasePtr(out)); // do mul - framework::Tensor scale_tensor = + phi::DenseTensor scale_tensor = context.AllocateTmpTensor({1}, dev_ctx); - framework::Tensor bias_tensor = + phi::DenseTensor bias_tensor = context.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc scale_desc(scale_tensor); MLUCnnlTensorDesc bias_desc(bias_tensor); @@ -67,9 +67,11 @@ class SquaredL2NormGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto &dev_ctx = context.template device_context(); - auto *x = context.Input("X"); - auto *x_grad = context.Output(framework::GradVarName("X")); - auto *out_grad = context.Input(framework::GradVarName("Out")); + auto *x = context.Input("X"); + auto *x_grad = + context.Output(framework::GradVarName("X")); + auto *out_grad = + context.Input(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ( out_grad->numel(), @@ -108,9 +110,9 @@ class SquaredL2NormGradMLUKernel : public framework::OpKernel { ToCnnlDataType(x->dtype())); // mul - framework::Tensor scale_tensor = + phi::DenseTensor scale_tensor = context.AllocateTmpTensor({1}, dev_ctx); - framework::Tensor bias_tensor = + phi::DenseTensor bias_tensor = context.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc scale_desc(scale_tensor); MLUCnnlTensorDesc bias_desc(bias_tensor); diff --git a/paddle/fluid/operators/squared_l2_norm_op_npu.cc b/paddle/fluid/operators/squared_l2_norm_op_npu.cc index 56fae36570c19..25260ed4c1286 100644 --- a/paddle/fluid/operators/squared_l2_norm_op_npu.cc +++ b/paddle/fluid/operators/squared_l2_norm_op_npu.cc @@ -18,14 +18,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class SquaredL2NormNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *out = context.Output("Out"); + auto *x = context.Input("X"); + auto *out = context.Output("Out"); auto place = context.GetPlace(); auto stream = @@ -47,9 +47,11 @@ template class SquaredL2NormGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *x_grad = context.Output(framework::GradVarName("X")); - auto *out_grad = context.Input(framework::GradVarName("Out")); + auto *x = context.Input("X"); + auto *x_grad = + context.Output(framework::GradVarName("X")); + auto *out_grad = + context.Input(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ( out_grad->numel(), diff --git a/paddle/fluid/operators/stack_op_mlu.cc b/paddle/fluid/operators/stack_op_mlu.cc index 3e9a51b47939e..eeac200676f4a 100644 --- a/paddle/fluid/operators/stack_op_mlu.cc +++ b/paddle/fluid/operators/stack_op_mlu.cc @@ -19,14 +19,14 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class StackMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.MultiInput("X"); - auto* y = ctx.Output("Y"); + auto x = ctx.MultiInput("X"); + auto* y = ctx.Output("Y"); int axis = ctx.Attr("axis"); if (axis < 0) axis += (x[0]->dims().size() + 1); int num = static_cast(x.size()); diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc index 540e3f22cd816..3b5c0b1dc0cb6 100644 --- a/paddle/fluid/operators/stack_op_npu.cc +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -18,14 +18,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class StackNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.MultiInput("X"); - auto* y = ctx.Output("Y"); + auto x = ctx.MultiInput("X"); + auto* y = ctx.Output("Y"); int axis = ctx.Attr("axis"); if (axis < 0) axis += (x[0]->dims().size() + 1); int num = static_cast(x.size()); @@ -39,7 +39,7 @@ class StackNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - std::vector x_list; + std::vector x_list; for (int i = 0; i < num; i++) { x_list.push_back(*x[i]); } @@ -55,8 +55,8 @@ template class StackGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); int axis = ctx.Attr("axis"); if (axis < 0) axis += dy->dims().size(); int num = dy->dims()[axis]; @@ -70,7 +70,7 @@ class StackGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - std::vector dx_list; + std::vector dx_list; for (int i = 0; i < num; i++) { dx[i]->mutable_data(ctx.GetPlace()); dx_list.push_back(*dx[i]); diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h index fb2ca31608cd7..23130f687e305 100644 --- a/paddle/fluid/operators/stft_op.h +++ b/paddle/fluid/operators/stft_op.h @@ -27,7 +27,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class StftKernel : public framework::OpKernel { @@ -38,9 +38,9 @@ class StftKernel : public framework::OpKernel { */ void Compute(const framework::ExecutionContext& ctx) const override { using C = paddle::platform::complex; - const Tensor* x = ctx.Input("X"); - const Tensor* window = ctx.Input("Window"); - Tensor* out = ctx.Output("Out"); + const phi::DenseTensor* x = ctx.Input("X"); + const phi::DenseTensor* window = ctx.Input("Window"); + phi::DenseTensor* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); const size_t x_rank = x->dims().size(); @@ -109,9 +109,9 @@ class StftGradKernel : public framework::OpKernel { using C = paddle::platform::complex; auto& dev_ctx = ctx.device_context(); - const Tensor* window = ctx.Input("Window"); - const auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + const phi::DenseTensor* window = ctx.Input("Window"); + const auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* dx = 
ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); const size_t dy_rank = dy->dims().size(); diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 350c3820a38c2..3a562d2f26e85 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -146,9 +146,9 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, template inline void StridedMemcpyWithAxis0( const platform::DeviceContext& dev_ctx, - const framework::Tensor& input, - const std::vector& shape_refer, - std::vector* outputs) { + const phi::DenseTensor& input, + const std::vector& shape_refer, + std::vector* outputs) { const framework::DDim in_stride = stride_numel(input.dims()); const int axis = 0; size_t input_offset = 0; diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc index 788ffb7e1f82e..ad75d23452c91 100644 --- a/paddle/fluid/operators/strided_slice_op.cc +++ b/paddle/fluid/operators/strided_slice_op.cc @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class StridedSliceOp : public framework::OperatorWithKernel { public: @@ -58,7 +58,7 @@ class StridedSliceOp : public framework::OperatorWithKernel { ctx.device_context()); } // NOTE: cuda pinned tensor need to copy its data to target place - auto in_tensor = ctx.Input("Input"); + auto in_tensor = ctx.Input("Input"); if (platform::is_cuda_pinned_place(in_tensor->place())) { return framework::OpKernelType( framework::TransToProtoVarType(in_tensor->dtype()), diff --git a/paddle/fluid/operators/strided_slice_op_mlu.cc b/paddle/fluid/operators/strided_slice_op_mlu.cc index 95972d8159267..806c8205d0970 100644 --- a/paddle/fluid/operators/strided_slice_op_mlu.cc +++ b/paddle/fluid/operators/strided_slice_op_mlu.cc @@ -95,7 +95,7 @@ class StridedSliceMLUKernel : public framework::OpKernel { false, platform::errors::InvalidArgument( "Tensor array as input is not supported.")); - int rank = ctx.Input("Input")->dims().size(); + int rank = ctx.Input("Input")->dims().size(); switch (rank) { case 1: StridedSliceCompute<1>(ctx); @@ -133,8 +133,8 @@ class StridedSliceMLUKernel : public framework::OpKernel { void StridedSliceCompute(const framework::ExecutionContext& ctx) const { auto place = ctx.GetPlace(); - auto in = ctx.Input("Input"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("Input"); + auto out = ctx.Output("Out"); auto in_dims = in->dims(); // list @@ -152,31 +152,31 @@ class StridedSliceMLUKernel : public framework::OpKernel { // vector> auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); + ctx.MultiInput("StartsTensorList"); auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); + ctx.MultiInput("EndsTensorList"); auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); + ctx.MultiInput("StridesTensorList"); // Tensor if (list_new_starts_tensor.size() > 0) { starts = GetDataFromTensorList(list_new_starts_tensor); } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); + auto* starts_tensor = ctx.Input("StartsTensor"); starts = GetDataFromTensor(starts_tensor); } if (list_new_ends_tensor.size() > 0) { ends = GetDataFromTensorList(list_new_ends_tensor); } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); + auto* ends_tensor = ctx.Input("EndsTensor"); ends = 
GetDataFromTensor(ends_tensor); } if (list_new_strides_tensor.size() > 0) { strides = GetDataFromTensorList(list_new_strides_tensor); } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); + auto* strides_tensor = ctx.Input("StridesTensor"); strides = GetDataFromTensor(strides_tensor); } @@ -263,7 +263,7 @@ class StridedSliceGradMLUKernel : public framework::OpKernel { false, platform::errors::InvalidArgument( "Tensor array as input is not supported.")); - int rank = ctx.Input("Input")->dims().size(); + int rank = ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -302,10 +302,10 @@ class StridedSliceGradMLUKernel : public framework::OpKernel { void StridedSliceGradCompute(const framework::ExecutionContext& ctx) const { auto place = ctx.GetPlace(); - auto* input = ctx.Input("Input"); + auto* input = ctx.Input("Input"); auto input_dims = input->dims(); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("Input")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("Input")); dx->mutable_data(input_dims, place); auto starts_int = ctx.Attr>("starts"); @@ -321,30 +321,30 @@ class StridedSliceGradMLUKernel : public framework::OpKernel { auto decrease_axis = ctx.Attr>("decrease_axis"); auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); + ctx.MultiInput("EndsTensorList"); auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); + ctx.MultiInput("StartsTensorList"); auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); + ctx.MultiInput("StridesTensorList"); if (list_new_starts_tensor.size() > 0) { starts = GetDataFromTensorList(list_new_starts_tensor); } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); + auto* starts_tensor = ctx.Input("StartsTensor"); starts = GetDataFromTensor(starts_tensor); } if (list_new_ends_tensor.size() > 0) { ends = GetDataFromTensorList(list_new_ends_tensor); } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); + auto* ends_tensor = ctx.Input("EndsTensor"); ends = GetDataFromTensor(ends_tensor); } if (list_new_strides_tensor.size() > 0) { strides = GetDataFromTensorList(list_new_strides_tensor); } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); + auto* strides_tensor = ctx.Input("StridesTensor"); strides = GetDataFromTensor(strides_tensor); } diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc index 6b7ee2e744ea7..9a1492fea1ee5 100644 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ b/paddle/fluid/operators/strided_slice_op_npu.cc @@ -29,7 +29,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { false, platform::errors::InvalidArgument( "Tensor array as input is not supported.")); - int rank = ctx.Input("Input")->dims().size(); + int rank = ctx.Input("Input")->dims().size(); switch (rank) { case 1: StridedSliceCompute<1>(ctx); @@ -64,8 +64,8 @@ class StridedSliceNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto in = ctx.Input("Input"); - auto out = ctx.Output("Out"); + auto in = ctx.Input("Input"); + auto out = ctx.Output("Out"); auto in_dims = in->dims(); // list @@ -83,31 +83,31 @@ class StridedSliceNPUKernel : public framework::OpKernel { // vector> auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); + 
ctx.MultiInput("EndsTensorList"); auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); + ctx.MultiInput("StartsTensorList"); auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); + ctx.MultiInput("StridesTensorList"); // Tensor if (list_new_starts_tensor.size() > 0) { starts = GetDataFromTensorList(list_new_starts_tensor); } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); + auto* starts_tensor = ctx.Input("StartsTensor"); starts = GetDataFromTensor(starts_tensor); } if (list_new_ends_tensor.size() > 0) { ends = GetDataFromTensorList(list_new_ends_tensor); } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); + auto* ends_tensor = ctx.Input("EndsTensor"); ends = GetDataFromTensor(ends_tensor); } if (list_new_strides_tensor.size() > 0) { strides = GetDataFromTensorList(list_new_strides_tensor); } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); + auto* strides_tensor = ctx.Input("StridesTensor"); strides = GetDataFromTensor(strides_tensor); } @@ -256,7 +256,7 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { false, platform::errors::InvalidArgument( "Tensor array as input is not supported.")); - int rank = ctx.Input("Input")->dims().size(); + int rank = ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -291,10 +291,10 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); - auto* input = ctx.Input("Input"); + auto* input = ctx.Input("Input"); auto input_dims = input->dims(); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("Input")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("Input")); dx->mutable_data(input_dims, place); auto starts_int = ctx.Attr>("starts"); @@ -310,30 +310,30 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { auto decrease_axis = ctx.Attr>("decrease_axis"); auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); + ctx.MultiInput("EndsTensorList"); auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); + ctx.MultiInput("StartsTensorList"); auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); + ctx.MultiInput("StridesTensorList"); if (list_new_starts_tensor.size() > 0) { starts = GetDataFromTensorList(list_new_starts_tensor); } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); + auto* starts_tensor = ctx.Input("StartsTensor"); starts = GetDataFromTensor(starts_tensor); } if (list_new_ends_tensor.size() > 0) { ends = GetDataFromTensorList(list_new_ends_tensor); } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); + auto* ends_tensor = ctx.Input("EndsTensor"); ends = GetDataFromTensor(ends_tensor); } if (list_new_strides_tensor.size() > 0) { strides = GetDataFromTensorList(list_new_strides_tensor); } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); + auto* strides_tensor = ctx.Input("StridesTensor"); strides = GetDataFromTensor(strides_tensor); } diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index 3539e2213a39d..f1a7688372adc 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -477,7 +477,7 @@ class 
FasterTokenizerOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h index d02313ea8d0c5..0c98190252419 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.h +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -129,8 +129,8 @@ class FasterTokenizerKernel : public framework::OpKernel { auto* text = ctx.Input("Text"); auto* vocab = ctx.Input("Vocab"); - auto* input_ids = ctx.Output("InputIds"); - auto* seg_ids = ctx.Output("SegmentIds"); + auto* input_ids = ctx.Output("InputIds"); + auto* seg_ids = ctx.Output("SegmentIds"); auto do_lower_case = static_cast(ctx.Attr("do_lower_case")); auto is_split_into_words = diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index ec570f709c35c..8cf6a095e2304 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -28,7 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; class SumOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/sum_op_mlu.cc b/paddle/fluid/operators/sum_op_mlu.cc index 0bb51581e9360..7c741632c1e1f 100644 --- a/paddle/fluid/operators/sum_op_mlu.cc +++ b/paddle/fluid/operators/sum_op_mlu.cc @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using LoDTensor = framework::LoDTensor; @@ -31,7 +31,7 @@ class SumMLUKernel : public framework::OpKernel { if (out_var->IsType()) { // init auto *out = out_var->GetMutable(); - auto ins = ctx.MultiInput("X"); + auto ins = ctx.MultiInput("X"); out->mutable_data(ctx.GetPlace()); auto place = ctx.GetPlace(); int ins_size = static_cast(ins.size()); diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc index a7bb442fa650c..c5ad250bb3cd1 100644 --- a/paddle/fluid/operators/sum_op_npu.cc +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using LoDTensor = framework::LoDTensor; @@ -34,7 +34,7 @@ class SumNPUKernel : public framework::OpKernel { auto out_var = ctx.OutputVar("Out"); if (out_var->IsType()) { auto *out = out_var->GetMutable(); - auto x = ctx.MultiInput("X"); + auto x = ctx.MultiInput("X"); out->mutable_data(ctx.GetPlace()); auto place = ctx.GetPlace(); @@ -45,7 +45,7 @@ class SumNPUKernel : public framework::OpKernel { return; } - std::vector inputs; + std::vector inputs; std::vector names; for (int i = 0; i < n; ++i) { if (x[i] && x[i]->numel() > 0) { diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc index a445868153452..82f4ba7a50b6e 100644 --- a/paddle/fluid/operators/sum_op_xpu.cc +++ b/paddle/fluid/operators/sum_op_xpu.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; + using SelectedRows = phi::SelectedRows; using LoDTensor = framework::LoDTensor; template diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index a796aa9d54444..3fb7994566a2f 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -36,9 +36,9 @@ namespace paddle { namespace operators { namespace math { -using Tensor = framework::Tensor; -using InTensors = std::vector; -using OutTensors = std::vector; +using Tensor = phi::DenseTensor; +using InTensors = std::vector; +using OutTensors = std::vector; using OpName = std::string; template >; + std::map>; using NameOutTensor = std::vector; explicit DeviceIndependenceTensorOperations( const framework::ExecutionContext& context) : context(context) {} - framework::Tensor Pow(const framework::Tensor& x, T exp) { - framework::Tensor out; + phi::DenseTensor Pow(const phi::DenseTensor& x, T exp) { + phi::DenseTensor out; auto for_range = GetForRange(x.numel()); int numel = x.numel(); PowFunctor functor( @@ -295,11 +295,11 @@ struct DeviceIndependenceTensorOperations { for_range(functor); return out; } - framework::Tensor Matmul(const framework::Tensor& mat_a, - const framework::Tensor& mat_b, - bool trans_a = false, - bool trans_b = false) { - framework::Tensor ret; + phi::DenseTensor Matmul(const phi::DenseTensor& mat_a, + const phi::DenseTensor& mat_b, + bool trans_a = false, + bool trans_b = false) { + phi::DenseTensor ret; auto a_dim = mat_a.dims(); auto b_dim = mat_b.dims(); std::vector x_vec = phi::vectorize(a_dim); @@ -315,9 +315,9 @@ struct DeviceIndependenceTensorOperations { return ret; } - framework::Tensor Transpose(const framework::Tensor& x) { + phi::DenseTensor Transpose(const phi::DenseTensor& x) { // transpose the last two dimision - framework::Tensor ret; + phi::DenseTensor ret; auto x_dim = x.dims(); auto x_vec = phi::vectorize(x_dim); int rank = x_vec.size(); @@ -345,10 +345,10 @@ struct DeviceIndependenceTensorOperations { } return ret; } - framework::Tensor Diag(const framework::Tensor& x, - int offset = 0, - // FIXME link error - int padding_value = 0) { + phi::DenseTensor Diag(const phi::DenseTensor& x, + int offset = 0, + // FIXME link error + int padding_value = 0) { PADDLE_ENFORCE_EQ(padding_value, 0, platform::errors::InvalidArgument( @@ -359,7 +359,7 @@ struct DeviceIndependenceTensorOperations { "Current diag only support offset = 0," "you can use DiagOp instead(not recommend)")); - framework::Tensor ret; + phi::DenseTensor ret; int x_rank = x.dims().size(); std::vector out_shape; if (x_rank == 2) { @@ -382,7 +382,7 @@ struct DeviceIndependenceTensorOperations { } // batch_diag for CPU only - Tensor BatchDiag(const Tensor& x, int batch) { + Tensor BatchDiag(const phi::DenseTensor& x, int batch) { Tensor out; auto* x_data = x.data>(); auto numel = x.numel(); @@ -411,8 +411,8 @@ struct DeviceIndependenceTensorOperations { } // a complex number x times a real number y, which is represented as (a+0j) - Tensor RealMulComplex(const Tensor& x, const Tensor& y) { - framework::Tensor ret; + Tensor RealMulComplex(const phi::DenseTensor& x, const phi::DenseTensor& y) { + phi::DenseTensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(phi::make_ddim(out_shape)); ElementwiseComputeEx, DeviceContext, T>( @@ -420,9 +420,8 @@ struct DeviceIndependenceTensorOperations { return ret; } - framework::Tensor Div(const framework::Tensor& x, - const framework::Tensor& y) { - 
framework::Tensor ret; + phi::DenseTensor Div(const phi::DenseTensor& x, const phi::DenseTensor& y) { + phi::DenseTensor ret; if (x.type() != y.type()) { ret.mutable_data(x.dims(), context.GetPlace()); auto x_vector = EigenVector::Flatten(x); @@ -439,19 +438,17 @@ struct DeviceIndependenceTensorOperations { } return ret; } - framework::Tensor Add(const framework::Tensor& x, - const framework::Tensor& y) { + phi::DenseTensor Add(const phi::DenseTensor& x, const phi::DenseTensor& y) { // element wise add, support numpy broadcast. - framework::Tensor ret; + phi::DenseTensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(phi::make_ddim(out_shape)); ElementwiseComputeEx, DeviceContext, T>( context, &x, &y, -1, AddFunctor(), &ret); return ret; } - framework::Tensor Mul(const framework::Tensor& x, - const framework::Tensor& y) { - framework::Tensor ret; + phi::DenseTensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) { + phi::DenseTensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(phi::make_ddim(out_shape)); ElementwiseComputeEx, DeviceContext, T>( @@ -459,16 +456,16 @@ struct DeviceIndependenceTensorOperations { return ret; } - framework::Tensor ReduceSum(const framework::Tensor& x, - std::vector out_dim) { + phi::DenseTensor ReduceSum(const phi::DenseTensor& x, + std::vector out_dim) { framework::AttributeMap attrs; attrs["dim"] = std::vector{-1}; NameInTensorMap inputs({{"X", {&x}}}); return CreateOpRunAndReturnTensor("reduce_sum", inputs, attrs, out_dim); } - framework::Tensor ReduceMax(const framework::Tensor& x, - std::vector out_dim) { + phi::DenseTensor ReduceMax(const phi::DenseTensor& x, + std::vector out_dim) { framework::AttributeMap attrs; attrs["dim"] = std::vector{-1}; NameInTensorMap inputs({{"X", {&x}}}); @@ -476,9 +473,8 @@ struct DeviceIndependenceTensorOperations { } // Support float and complex type subtraction,the default is T type template - framework::Tensor Sub(const framework::Tensor& x, - const framework::Tensor& y) { - framework::Tensor ret; + phi::DenseTensor Sub(const phi::DenseTensor& x, const phi::DenseTensor& y) { + phi::DenseTensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(phi::make_ddim(out_shape)); if (platform::is_gpu_place(context.GetPlace())) { @@ -501,9 +497,9 @@ struct DeviceIndependenceTensorOperations { } return ret; } - const framework::Tensor Unsqueeze(const framework::Tensor& x, int axis = 0) { + const phi::DenseTensor Unsqueeze(const phi::DenseTensor& x, int axis = 0) { // don't copy data, only change the dims - framework::Tensor out; + phi::DenseTensor out; out.ShareDataWith(x); std::vector out_shape = phi::vectorize(x.dims()); if (axis >= 0) { @@ -516,28 +512,28 @@ struct DeviceIndependenceTensorOperations { out.Resize(phi::make_ddim(out_shape)); return out; } - framework::Tensor Fill(std::vector shape, float fill_value) { - framework::Tensor ret; + phi::DenseTensor Fill(std::vector shape, float fill_value) { + phi::DenseTensor ret; ret.Resize(phi::make_ddim(shape)); ret.mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); phi::funcs::SetConstant()(dev_ctx, &ret, T(fill_value)); return ret; } - framework::Tensor Infinits(std::vector shape) { + phi::DenseTensor Infinits(std::vector shape) { auto value = static_cast(std::numeric_limits::infinity()); return Fill(shape, value); } - framework::Tensor Eye(int n) { + phi::DenseTensor Eye(int n) { auto output = Fill({n}, 1); auto ret = Diag(output); return ret; } - framework::Tensor 
Slice(const framework::Tensor& x, - std::vector axes, - std::vector starts, - std::vector ends) { - framework::Tensor ret; + phi::DenseTensor Slice(const phi::DenseTensor& x, + std::vector axes, + std::vector starts, + std::vector ends) { + phi::DenseTensor ret; std::vector new_axes = axes; std::vector out_shape = phi::vectorize(x.dims()); size_t rank = out_shape.size(); @@ -588,9 +584,9 @@ struct DeviceIndependenceTensorOperations { return ret; } - framework::Tensor TrilTriu(const framework::Tensor& x, - int diagonal, - bool lower) { + phi::DenseTensor TrilTriu(const phi::DenseTensor& x, + int diagonal, + bool lower) { framework::AttributeMap attrs; attrs["diagonal"] = diagonal; attrs["lower"] = lower; @@ -604,11 +600,11 @@ struct DeviceIndependenceTensorOperations { return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape); } - framework::Tensor TriangularSolve(const framework::Tensor& x, - const framework::Tensor& y, - bool upper, - bool transpose, - bool unitriangular) { + phi::DenseTensor TriangularSolve(const phi::DenseTensor& x, + const phi::DenseTensor& y, + bool upper, + bool transpose, + bool unitriangular) { framework::AttributeMap attrs; attrs["upper"] = upper; attrs["transpose"] = transpose; @@ -635,9 +631,9 @@ struct DeviceIndependenceTensorOperations { "triangular_solve", inputs, attrs, out_shape); } - framework::Tensor ConcatTwoTensors(const framework::Tensor& x, - const framework::Tensor& y, - int axis) { + phi::DenseTensor ConcatTwoTensors(const phi::DenseTensor& x, + const phi::DenseTensor& y, + int axis) { framework::AttributeMap attrs; attrs["axis"] = axis; std::vector inputs_dims({x.dims(), y.dims()}); @@ -654,7 +650,7 @@ struct DeviceIndependenceTensorOperations { return CreateOpRunAndReturnTensor("concat", inputs, attrs, out_shape); } - Tensor Conj(const Tensor& x) { + Tensor Conj(const phi::DenseTensor& x) { Tensor out; auto* out_data = out.mutable_data(x.dims(), context.GetPlace()); auto* x_data = x.data(); @@ -664,7 +660,7 @@ struct DeviceIndependenceTensorOperations { return out; } - Tensor Real(const Tensor& x) { + Tensor Real(const phi::DenseTensor& x) { Tensor out; auto numel = x.numel(); auto* out_data = out.mutable_data>( @@ -682,8 +678,8 @@ struct DeviceIndependenceTensorOperations { const int n, const int num_lower_diags, const int num_upper_diags, - const Tensor& scale, - const Tensor& input) { + const phi::DenseTensor& scale, + const phi::DenseTensor& input) { Tensor out; auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, input.numel()); @@ -709,10 +705,10 @@ struct DeviceIndependenceTensorOperations { return platform::ForRange(dev_ctx, numel); } template - void EigenSliceWrapper(const framework::Tensor* in, + void EigenSliceWrapper(const phi::DenseTensor* in, const std::vector& start, const std::vector& end, - framework::Tensor* out) { + phi::DenseTensor* out) { // Slice by call Eigen Tensor Function `.slice()` size_t rank = in->dims().size(); PADDLE_ENFORCE_EQ(start.size(), @@ -742,7 +738,7 @@ struct DeviceIndependenceTensorOperations { offsets_32bit, extents_32bit); } - framework::Tensor CreateOpRunAndReturnTensor( + phi::DenseTensor CreateOpRunAndReturnTensor( const std::string& type, const NameInTensorMap& inputs, const framework::AttributeMap& attrs, @@ -781,7 +777,7 @@ struct DeviceIndependenceTensorOperations { auto op = framework::OpRegistry::CreateOp(type, op_inputs, op_outputs, attrs); op->Run(local_scope, context.GetPlace()); - framework::Tensor out; + phi::DenseTensor out; 
out.ShareDataWith(*(out_var->GetMutable())); out.Resize(phi::make_ddim(out_shape)); context.scope().DeleteScope(&local_scope); diff --git a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc index 0a95088c31f2c..d2fa4f794efb6 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc @@ -26,7 +26,7 @@ namespace operators { #define NO_USE_CNCL 0 #define GET_LAYOUT_OFFSET 2 -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; static std::vector supported_input_layout = { CNNL_LAYOUT_NC, CNNL_LAYOUT_NLC, CNNL_LAYOUT_NHWC, CNNL_LAYOUT_NDHWC}; @@ -51,16 +51,16 @@ class SyncBatchNormMLUKernel : public framework::OpKernel { "to set use_global_stats True. Please use batch_norm " "in this case.")); - const auto *x = ctx.Input("X"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *mean = ctx.Input("Mean"); - const auto *variance = ctx.Input("Variance"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - auto *y = ctx.Output("Y"); + const auto *x = ctx.Input("X"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *mean = ctx.Input("Mean"); + const auto *variance = ctx.Input("Variance"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + auto *y = ctx.Output("Y"); const auto &x_dims = x->dims(); PADDLE_ENFORCE_GE(x_dims.size(), @@ -136,7 +136,7 @@ class SyncBatchNormMLUKernel : public framework::OpKernel { nullptr); } else { // training if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); + const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; paddle::framework::TensorCopySync( *mom_tensor, platform::CPUPlace(), &mom_cpu); @@ -287,17 +287,18 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { const std::string layout_str = ctx.Attr("data_layout"); const DataLayout layout = framework::StringToDataLayout(layout_str); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = + ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_inv_var = ctx.Input("SavedVariance"); + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_inv_var = ctx.Input("SavedVariance"); const Tensor *x; if (ctx.HasInput("Y")) { @@ -306,7 +307,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { platform::errors::InvalidArgument( "sync_batch_norm_grad doesn't support input Y")); } else { - x = ctx.Input("X"); + x = ctx.Input("X"); } const auto &x_dims = x->dims(); diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc 
b/paddle/fluid/operators/sync_batch_norm_op_npu.cc index 1789110a18af2..08136d7fe2ea4 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc @@ -20,7 +20,7 @@ limitations under the Licnse. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template void training_or_inference(const framework::ExecutionContext &ctx, @@ -325,16 +325,16 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { "to set use_global_stats True. Please use batch_norm " "in this case.")); - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Y"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *mean = ctx.Input("Mean"); - const auto *variance = ctx.Input("Variance"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Y"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *mean = ctx.Input("Mean"); + const auto *variance = ctx.Input("Variance"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); const auto &x_dims = x->dims(); PADDLE_ENFORCE_EQ(x_dims.size(), @@ -398,7 +398,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { } else { // training if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); + const auto *mom_tensor = ctx.Input("MomentumTensor"); Tensor mom_cpu; paddle::framework::TensorCopySync( *mom_tensor, platform::CPUPlace(), &mom_cpu); @@ -581,12 +581,13 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { const std::string layout_str = ctx.Attr("data_layout"); const DataLayout layout = framework::StringToDataLayout(layout_str); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - const auto *saved_mean = ctx.Input("SavedMean"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = + ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + const auto *saved_mean = ctx.Input("SavedMean"); const Tensor *x; if (ctx.HasInput("Y")) { @@ -595,7 +596,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { platform::errors::InvalidArgument( "sync_batch_norm_grad doesn't support input Y")); } else { - x = ctx.Input("X"); + x = ctx.Input("X"); } int N, C, H, W, D; diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc index 544c23d8658d2..0856645ad67e3 100644 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ b/paddle/fluid/operators/take_along_axis_op.cc @@ -39,7 +39,7 @@ class TakeAlongAxisOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const 
override { return framework::OpKernelType( expected_kernel_type.data_type_, tensor.place(), tensor.layout()); @@ -79,7 +79,7 @@ class TakeAlongAxisGradOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { return framework::OpKernelType( expected_kernel_type.data_type_, tensor.place(), tensor.layout()); diff --git a/paddle/fluid/operators/take_along_axis_op_npu.cc b/paddle/fluid/operators/take_along_axis_op_npu.cc index ab2c42a86b72a..d4f06e6446887 100644 --- a/paddle/fluid/operators/take_along_axis_op_npu.cc +++ b/paddle/fluid/operators/take_along_axis_op_npu.cc @@ -22,16 +22,16 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class NPUTakeAlongAxisKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("Input"); + auto input = ctx.Input("Input"); auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); + auto index = ctx.Input("Index"); + auto result = ctx.Output("Result"); result->mutable_data(ctx.GetPlace()); auto stream = @@ -48,10 +48,12 @@ class NPUTakeAlongAxisGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); + auto index = ctx.Input("Index"); + auto result_grad = + ctx.Input(framework::GradVarName("Result")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto input_grad = + ctx.Output(framework::GradVarName("Input")); input_grad->mutable_data(ctx.GetPlace()); auto stream = diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h index b1ca81d566063..445b2fa89e4aa 100644 --- a/paddle/fluid/operators/tdm_child_op.h +++ b/paddle/fluid/operators/tdm_child_op.h @@ -28,7 +28,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using DDim = framework::DDim; using LoD = framework::LoD; diff --git a/paddle/fluid/operators/tdm_sampler_op.h b/paddle/fluid/operators/tdm_sampler_op.h index 28a1260b3efd3..ab24d6b763546 100644 --- a/paddle/fluid/operators/tdm_sampler_op.h +++ b/paddle/fluid/operators/tdm_sampler_op.h @@ -29,7 +29,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using Sampler = math::Sampler; using DDim = framework::DDim; using LoD = framework::LoD; diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 4525d431ff136..f880181662e24 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h index 41d2662ae2a4d..40bac8c364583 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h @@ -19,14 +19,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - Tensor* y = context.Output("Y"); - const Tensor* x = context.Input("X"); - const Tensor* labels = context.Input("Label"); + phi::DenseTensor* y = context.Output("Y"); + const phi::DenseTensor* x = context.Input("X"); + const phi::DenseTensor* labels = context.Input("Label"); T* y_data = y->mutable_data(context.GetPlace()); const T* x_data = x->data(); const T* label_data = labels->data(); @@ -68,13 +68,14 @@ template class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); + const phi::DenseTensor* x = context.Input("X"); const T* x_data = x->data(); - Tensor* dx = context.Output(framework::GradVarName("X")); + phi::DenseTensor* dx = + context.Output(framework::GradVarName("X")); T* dx_data = dx->mutable_data(context.GetPlace()); - const Tensor* labels = context.Input("Label"); + const phi::DenseTensor* labels = context.Input("Label"); const T* label_data = labels->data(); T soft_max_up_bound = @@ -84,8 +85,8 @@ class TeacherStudentSigmoidLossGradOpKernel : public framework::OpKernel { int64_t batch_size = x->dims()[0]; - const framework::Tensor* dOut = - context.Input(framework::GradVarName("Y")); + const phi::DenseTensor* dOut = + context.Input(framework::GradVarName("Y")); const T* dout_data = dOut->data(); diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index ca446fcb97236..119fcf4f49bc5 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -23,8 +23,6 @@ namespace paddle { namespace operators { -using framework::Tensor; - class TemporalShiftOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index fe6a9dd36c8e8..979cc129e9d1e 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -16,8 +16,6 @@ namespace paddle { namespace operators { -using framework::Tensor; - template __global__ void KeTemporalShiftFwNCHW(const T* input, T* output, @@ -162,8 +160,8 @@ class TemporalShiftOpCUDAKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "This kernel only runs on GPU device.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); int t = ctx.Attr("seg_num"); float shift_ratio = ctx.Attr("shift_ratio"); const std::string data_format_str = ctx.Attr("data_format"); @@ -215,8 +213,10 @@ template class 
TemporalShiftGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); int t = ctx.Attr("seg_num"); float shift_ratio = ctx.Attr("shift_ratio"); const std::string data_format_str = ctx.Attr("data_format"); diff --git a/paddle/fluid/operators/temporal_shift_op.h b/paddle/fluid/operators/temporal_shift_op.h index 688cd816b50cc..abc00c7e600a1 100644 --- a/paddle/fluid/operators/temporal_shift_op.h +++ b/paddle/fluid/operators/temporal_shift_op.h @@ -16,7 +16,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = framework::DataLayout; template @@ -91,8 +91,10 @@ template class TemporalShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); int t = ctx.Attr("seg_num"); float shift_ratio = ctx.Attr("shift_ratio"); const std::string data_format_str = ctx.Attr("data_format"); diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index fa25d0b3494bd..bbab23530fa1c 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -21,7 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; void LodTensorArray2LodTensorVector(const framework::Scope &scope, const std::string &base_name, diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 0f8a3d1206264..e7ac8909ca691 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -505,7 +505,7 @@ class TensorRTEngineOp : public framework::OperatorBase { inference::analysis::GetFromScope(scope, x); // check the input_tensor if (!platform::is_gpu_place(t.place())) { - framework::Tensor out; + phi::DenseTensor out; platform::CUDAPlace dst_place; framework::TransDataDevice(t, dst_place, &out); t.ShareDataWith(out); diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h index 1162bf21592d5..31f913cc65b9c 100644 --- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h @@ -25,9 +25,8 @@ namespace paddle { namespace operators { template -static void InitRandom(framework::Tensor *tensor, - const platform::Place &place) { - framework::Tensor cpu_tensor; +static void InitRandom(phi::DenseTensor *tensor, const platform::Place &place) { + phi::DenseTensor cpu_tensor; auto *cpu_ptr = cpu_tensor.mutable_data(tensor->dims(), platform::CPUPlace()); int64_t numel = cpu_tensor.numel(); @@ -69,23 +68,23 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim, LeakyReluGradGradFunctor functor; functor.alpha = alpha; auto &dev_ctx = *platform::DeviceContextPool::Instance().Get(place); - framework::Tensor *out = nullptr; - framework::Tensor *dout = nullptr; - framework::Tensor *dx = nullptr; + phi::DenseTensor *out = nullptr; + phi::DenseTensor *dout = nullptr; + phi::DenseTensor *dx = nullptr; - framework::Tensor x; + phi::DenseTensor x; x.Resize(dim); InitRandom(&x, place); - framework::Tensor ddx; + phi::DenseTensor ddx; ddx.Resize(dim); InitRandom(&ddx, place); - framework::Tensor ddout; + phi::DenseTensor ddout; ddout.Resize(dim); InitRandom(&ddout, place); - framework::Tensor ddout_actual; + phi::DenseTensor ddout_actual; ddout_actual.mutable_data(dim, place); LeakyReluGradGradEachElementFunctor actual_functor(ddx.data(), x.data(), @@ -112,7 +111,7 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim, dev_ctx.Wait(); - framework::Tensor ddout_cpu, ddout_actual_cpu; + phi::DenseTensor ddout_cpu, ddout_actual_cpu; framework::TensorCopySync(ddout, platform::CPUPlace(), &ddout_cpu); framework::TensorCopySync( ddout_actual, platform::CPUPlace(), &ddout_actual_cpu); diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index 8cf132915402e..172e96737061d 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -24,8 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using framework::Tensor; - class TileOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -40,7 +38,7 @@ class TileOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "repeat_times_tensor" || var_name == "RepeatTimes") { return expected_kernel_type; @@ -132,7 +130,7 @@ class TileGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "repeat_times_tensor" || var_name == "RepeatTimes") { return expected_kernel_type; diff --git a/paddle/fluid/operators/tile_op_functor.h b/paddle/fluid/operators/tile_op_functor.h index 03aa19c8817ab..16f77f4f17495 100644 --- a/paddle/fluid/operators/tile_op_functor.h +++ b/paddle/fluid/operators/tile_op_functor.h @@ -27,7 +27,7 @@ inline std::vector get_repeat_times( if (ctx.HasInput("RepeatTimes")) { auto* repeat_tensor = ctx.Input("RepeatTimes"); auto* repeat_data = repeat_tensor->data(); - framework::Tensor cpu_repeat_tensor; + phi::DenseTensor cpu_repeat_tensor; if (platform::is_gpu_place(repeat_tensor->place()) || platform::is_xpu_place(repeat_tensor->place()) || platform::is_mlu_place(repeat_tensor->place()) || @@ -42,7 +42,7 @@ inline std::vector get_repeat_times( } auto list_repeat_times_tensor = - ctx.MultiInput("repeat_times_tensor"); + ctx.MultiInput("repeat_times_tensor"); if (list_repeat_times_tensor.size() > 0) { // get tensor from std::vector vec_repeat_times; @@ -52,7 +52,7 @@ inline std::vector get_repeat_times( platform::is_xpu_place(tensor->place()) || platform::is_mlu_place(tensor->place()) || platform::is_npu_place(tensor->place())) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_repeat_times.push_back(*temp.data()); } else { diff --git a/paddle/fluid/operators/tile_op_mlu.cc b/paddle/fluid/operators/tile_op_mlu.cc index 80cb6340e4ca7..2b2b3df4431f1 100644 --- a/paddle/fluid/operators/tile_op_mlu.cc +++ b/paddle/fluid/operators/tile_op_mlu.cc @@ -18,13 +18,13 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class TileMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); + auto rank = context.Input("X")->dims().size(); PADDLE_ENFORCE_GE( rank, 1, @@ -58,7 +58,7 @@ class TileMLUKernel : public framework::OpKernel { MAX_RANK_SUPPORTED, repeat_times_size)); - auto* in0 = context.Input("X"); + auto* in0 = context.Input("X"); auto in_dims = in0->dims(); for (size_t i = 0; i < repeat_times.size(); ++i) { PADDLE_ENFORCE_GT( @@ -86,7 +86,7 @@ class TileMLUKernel : public framework::OpKernel { vec_in_dims.size(), repeat_times.size())); - auto* out0 = context.Output("Out"); + auto* out0 = context.Output("Out"); bool repeat_one_times = true; for (size_t i = 0; i < repeat_times.size(); ++i) { if (repeat_times[i] != 1) { diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index 706e9f7c52797..2997052257d18 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -18,14 +18,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template class TileNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); + auto rank = context.Input("X")->dims().size(); PADDLE_ENFORCE_GE( rank, 1, @@ -64,7 +64,7 @@ class TileNPUKernel : public framework::OpKernel { protected: void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); + auto* in0 = context.Input("X"); auto in_dims = in0->dims(); auto repeat_times = get_repeat_times(context); @@ -93,7 +93,7 @@ class TileNPUKernel : public framework::OpKernel { "'repeat_times' for tile op must match after promotion.", vec_in_dims.size(), repeat_times.size())); - auto* out0 = context.Output("Out"); + auto* out0 = context.Output("Out"); framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); framework::DDim out_dims(new_in_dims); diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 4a038c93a1f49..7cc88b24efe78 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -55,7 +55,7 @@ struct NumericTraits namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; inline void GetDims( const phi::DDim& dim, int axis, int* pre, int* n, int* post) { @@ -903,12 +903,12 @@ __global__ void AssignGradWithAxis(const T* grad_out, // use the radix sort for the topk template bool SortTopk(const phi::GPUContext& ctx, - const framework::Tensor* input_tensor, + const phi::DenseTensor* input_tensor, const int64_t num_cols, const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor, + phi::DenseTensor* out_tensor, + phi::DenseTensor* indices_tensor, bool largest = true) { auto cu_stream = ctx.stream(); diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 79236f590f7dc..c1df0a6b12eac 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -30,7 +30,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; #define FIXED_BLOCK_DIM_BASE(dim, ...) \ case (dim): { \ @@ -52,12 +52,12 @@ class TopkOpCUDAKernel : public framework::OpKernel { platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::InvalidArgument("It must use CUDAPlace.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); int k = static_cast(ctx.Attr("k")); - auto* k_t = ctx.Input("K"); + auto* k_t = ctx.Input("K"); if (k_t) { Tensor k_host; framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); @@ -122,10 +122,12 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { platform::is_gpu_place(context.GetPlace()), true, platform::errors::InvalidArgument("It must use CUDAPlace.")); - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); + auto* x = context.Input("X"); + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = + context.Output(framework::GradVarName("X")); T* x_grad_data = x_grad->mutable_data(context.GetPlace()); const T* out_grad_data = out_grad->data(); diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index fa573da0109d5..cd29137d530f4 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -24,19 +24,19 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class TopkKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // Get the top k elements of each row of input tensor - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); size_t k = static_cast(ctx.Attr("k")); - auto* k_t = ctx.Input("K"); + auto* k_t = ctx.Input("K"); if (k_t) { k = k_t->data()[0]; framework::DDim output_dims = output->dims(); @@ -94,10 +94,12 @@ template class TopkGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); + auto* x = context.Input("X"); + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = + context.Output(framework::GradVarName("X")); T* x_grad_data = x_grad->mutable_data(context.GetPlace()); const T* out_grad_data = out_grad->data(); diff --git a/paddle/fluid/operators/top_k_op_mlu.cc b/paddle/fluid/operators/top_k_op_mlu.cc index c38c4388997f6..a6b96466de442 100644 --- a/paddle/fluid/operators/top_k_op_mlu.cc +++ b/paddle/fluid/operators/top_k_op_mlu.cc @@ -28,7 +28,7 @@ class TopkMLUKernel : public framework::OpKernel { const auto& place = ctx.GetPlace(); size_t k = static_cast(ctx.Attr("k")); - auto* k_t = ctx.Input("K"); + auto* k_t = ctx.Input("K"); if (k_t) { auto k_t_ptr = 
static_cast(k_t->data()); auto size = k_t->numel() * sizeof(int); @@ -51,7 +51,7 @@ class TopkMLUKernel : public framework::OpKernel { const bool sorted = true; const int axis = -1; // cnnl only support int32/int16 type of indices - framework::Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32)); indices_int32.Resize(indices->dims()); indices_int32.mutable_data(place); diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index e0892af480070..4bf4204e79666 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace operators { -void gen_assist_seq(framework::Tensor* assit_tensor, +void gen_assist_seq(phi::DenseTensor* assit_tensor, int64_t dim, const framework::ExecutionContext& ctx) { const int64_t dimx2 = dim; @@ -55,7 +55,7 @@ class TopkNPUKernel : public framework::OpKernel { auto size = input->dims().size(); // dim is the last dimension of input auto dim = input->dims()[size - 1]; - framework::Tensor assist_seq_tensor; + phi::DenseTensor assist_seq_tensor; assist_seq_tensor.Resize({2 * dim}); assist_seq_tensor.mutable_data(ctx.GetPlace()); gen_assist_seq(&assist_seq_tensor, dim, ctx); diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc index 9ffcd4d46fc1a..46428a3596d56 100644 --- a/paddle/fluid/operators/top_k_op_xpu.cc +++ b/paddle/fluid/operators/top_k_op_xpu.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class TopkXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; @@ -31,15 +31,15 @@ class TopkXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // Get the top k elements of each row of input tensor - const auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); + const auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); // get k from attr int k = static_cast(ctx.Attr("k")); // get k from input tensor - auto* k_t = ctx.Input("K"); + auto* k_t = ctx.Input("K"); if (k_t) { memory::Copy(platform::CPUPlace(), static_cast(&k), diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc index bce76b1351fc8..b9e3d4ff0224e 100644 --- a/paddle/fluid/operators/top_k_v2_op_mlu.cc +++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc @@ -38,7 +38,7 @@ class TopkV2MLUKernel : public framework::OpKernel { } size_t k = static_cast(ctx.Attr("k")); - auto* k_t = ctx.Input("K"); + auto* k_t = ctx.Input("K"); if (k_t) { auto k_t_ptr = static_cast(k_t->data()); auto size = k_t->numel() * sizeof(int); @@ -59,7 +59,7 @@ class TopkV2MLUKernel : public framework::OpKernel { indices->mutable_data(place); // cnnl only support int32/int16 type of indices - framework::Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32)); indices_int32.Resize(indices->dims()); indices_int32.mutable_data(place); diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index 590f4f66fcbee..487938b142dce 100644 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ 
b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -26,10 +26,10 @@ template class TopkV2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* k_tensor = context.Input("K"); - auto* out = context.Output("Out"); - auto* indices = context.Output("Indices"); // type: INT64 + auto* input = context.Input("X"); + auto* k_tensor = context.Input("K"); + auto* out = context.Output("Out"); + auto* indices = context.Output("Indices"); // type: INT64 int32_t k = static_cast(context.Attr("k")); int axis = static_cast(context.Attr("axis")); @@ -58,7 +58,7 @@ class TopkV2NPUKernel : public framework::OpKernel { out->mutable_data(context.GetPlace()); indices->mutable_data(context.GetPlace()); - framework::Tensor indices_int32(experimental::DataType::INT32); + phi::DenseTensor indices_int32(experimental::DataType::INT32); indices_int32.Resize(output_dims); indices_int32.mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc index 86862d4a10f7d..ae1ad94d9f978 100644 --- a/paddle/fluid/operators/transfer_layout_op.cc +++ b/paddle/fluid/operators/transfer_layout_op.cc @@ -64,7 +64,7 @@ class TransferLayoutOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { return framework::OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h index 940d26789c52e..a4c7b482ff596 100644 --- a/paddle/fluid/operators/transfer_layout_op.h +++ b/paddle/fluid/operators/transfer_layout_op.h @@ -127,8 +127,8 @@ class TransferLayoutFunctor { private: void TransDataLayout(const platform::DeviceContext &dev_ctx, - const framework::Tensor &in, - framework::Tensor *out) const { + const phi::DenseTensor &in, + phi::DenseTensor *out) const { PADDLE_ENFORCE_EQ( phi::arity(in.dims()), 4, diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index b342f01e46ff7..d04b1ffa94b92 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -25,8 +25,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - class TransposeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -262,8 +260,8 @@ class Transpose2Op : public TransposeOp { library_ = framework::LibraryType::kMKLDNN; layout_ = framework::DataLayout::kMKLDNN; using framework::proto::VarType; - auto input_data_type = - framework::TransToProtoVarType(ctx.Input("X")->dtype()); + auto input_data_type = framework::TransToProtoVarType( + ctx.Input("X")->dtype()); customized_type_value = (input_data_type == VarType::INT8 || input_data_type == VarType::UINT8) ? kTransposeMKLDNNINT8 diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index f7c4597d43756..ac5f5adf2594c 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using Dim3 = framework::Dim3; using Index3 = framework::Index3; @@ -713,9 +713,9 @@ inline void CombineTransposeDim3(const framework::DDim& shape, template struct TransposeSimple { static bool run(const phi::GPUContext& ctx, - const Tensor& in, + const phi::DenseTensor& in, const std::vector perm, - Tensor* out) { + phi::DenseTensor* out) { // First reduce the dimensions of the input tensor if possible. std::vector new_perm; framework::DDim new_dims; @@ -1157,8 +1157,8 @@ inline void LaunchWithDispatchIndex(const phi::GPUContext& ctx, template inline void SimplifyThenLaunch(const int rank, const DeviceContext& ctx, - const Tensor& in, - Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, const std::vector& perm) { int sm_count = ctx.GetSMCount(); auto src_dims = phi::vectorize(in.dims()); @@ -1182,9 +1182,9 @@ inline void SimplifyThenLaunch(const int rank, template void TransposeGPUKernelDriver(const phi::GPUContext& ctx, - const Tensor& in, + const phi::DenseTensor& in, const std::vector& perm, - Tensor* out) { + phi::DenseTensor* out) { const int rank = perm.size(); int64_t numel = in.numel(); bool ret{false}; diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index 2a6849b1d2584..8b0fe26eeaa30 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -28,8 +28,8 @@ enum { kTransposeMKLDNNFP32 = 1, kTransposeMKLDNNINT8 = 2 }; template inline void TransCompute(const int dim, const DeviceContext& dev_ctx, - const framework::Tensor& in, - framework::Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, const std::vector& axis) { switch (dim) { case 1: diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h index 8c479076175dd..ee37c2e9fe09b 100644 --- a/paddle/fluid/operators/tree_conv_op.h +++ b/paddle/fluid/operators/tree_conv_op.h @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DDim = framework::DDim; template class TreeConvKernel : public framework::OpKernel { @@ -31,10 +31,10 @@ class TreeConvKernel : public framework::OpKernel { math::Tree2ColFunctor tree2col; phi::funcs::SetConstant constant; - auto *Edges = ctx.Input("EdgeSet"); - auto *Embeddings = ctx.Input("NodesVector"); - auto *Filter = ctx.Input("Filter"); - auto *output_emb = ctx.Output("Out"); + auto *Edges = ctx.Input("EdgeSet"); + auto *Embeddings = ctx.Input("NodesVector"); + auto *Filter = ctx.Input("Filter"); + auto *output_emb = ctx.Output("Out"); int max_depth = ctx.Attr("max_depth"); auto &dev_ctx = ctx.template device_context(); @@ -78,13 +78,15 @@ template class TreeConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *out_g = ctx.Input(framework::GradVarName("Out")); - auto *in_g = ctx.Output(framework::GradVarName("NodesVector")); - auto *filter_g = ctx.Output(framework::GradVarName("Filter")); + auto *out_g = ctx.Input(framework::GradVarName("Out")); + auto *in_g = + ctx.Output(framework::GradVarName("NodesVector")); + auto *filter_g = + ctx.Output(framework::GradVarName("Filter")); int max_depth = ctx.Attr("max_depth"); - auto *Embeddings = ctx.Input("NodesVector"); - auto *edges = ctx.Input("EdgeSet"); - auto *Filter = ctx.Input("Filter"); + auto *Embeddings = ctx.Input("NodesVector"); + auto 
*edges = ctx.Input("EdgeSet"); + auto *Filter = ctx.Input("Filter"); math::Tree2ColFunctor tree2col; math::Col2TreeFunctor col2tree; phi::funcs::SetConstant constant; diff --git a/paddle/fluid/operators/tril_triu_op_mlu.cc b/paddle/fluid/operators/tril_triu_op_mlu.cc index a4c5a3bddbc58..892261d6693ce 100644 --- a/paddle/fluid/operators/tril_triu_op_mlu.cc +++ b/paddle/fluid/operators/tril_triu_op_mlu.cc @@ -18,8 +18,8 @@ template class TrilTriuMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); int diagonal = ctx.Attr("diagonal"); bool lower = ctx.Attr("lower"); bool upper; diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index aeb8691518c1d..d7ca6a6602c3f 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -22,8 +22,8 @@ template class TrilTriuNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); int diagonal = ctx.Attr("diagonal"); bool lower = ctx.Attr("lower"); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_mlu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_mlu.cc index 6d5d9f8a3b1d8..d2d51c29371f8 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_mlu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_mlu.cc @@ -28,10 +28,10 @@ class TruncatedGaussianRandomMLUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { float mean = context.Attr("mean"); float std = context.Attr("std"); - auto* tensor = context.Output("Out"); + auto* tensor = context.Output("Out"); tensor->mutable_data(context.GetPlace()); - framework::Tensor cpu_tensor(tensor->dtype()); + phi::DenseTensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* data_cpu = cpu_tensor.mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 433e7d79e1ac3..b5e67ccb24a9a 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -22,7 +22,7 @@ limitations under the License. 
 */

 namespace paddle {
 namespace operators {

-using Tensor = framework::Tensor;
+using Tensor = phi::DenseTensor;

 template <typename T>
 class TruncatedGaussianRandomNPUKernel : public framework::OpKernel<T> {
@@ -57,7 +57,7 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel<T> {
     float max_value = mean + std * 2.0;
     FillNpuTensorWithConstant<T>(&max_tensor, max_value);
-    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* out = ctx.Output<phi::DenseTensor>("Out");
     out->mutable_data<T>(ctx.GetPlace());
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
@@ -80,7 +80,7 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     float mean = context.Attr<float>("mean");
     float std = context.Attr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>("Out");
+    auto* tensor = context.Output<phi::DenseTensor>("Out");
     tensor->mutable_data<T>(context.GetPlace());

     Tensor cpu_tensor(tensor->dtype());
diff --git a/paddle/fluid/operators/unbind_op.cc b/paddle/fluid/operators/unbind_op.cc
index d059c626fe7ea..be64767cb27a4 100644
--- a/paddle/fluid/operators/unbind_op.cc
+++ b/paddle/fluid/operators/unbind_op.cc
@@ -22,7 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using framework::Tensor;

 class UnbindOp : public framework::OperatorWithKernel {
  public:
diff --git a/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc b/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc
index 90076a67aafd3..bcd399ec08a7f 100644
--- a/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc
+++ b/paddle/fluid/operators/uniform_random_inplace_op_xpu.cc
@@ -75,7 +75,7 @@ template <typename T>
 class XPUUniformRandomInplaceGradKernel : public framework::OpKernel<T> {
  public:
  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto *dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto *dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    if (dx) {
      T *data = dx->mutable_data<T>(ctx.GetPlace());
      int64_t size = dx->numel();
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 5324b9697c94a..154c6906ca7c9 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -65,14 +65,14 @@ template <typename T>
 class CPUUniformRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    framework::Tensor *tensor = nullptr;
+    phi::DenseTensor *tensor = nullptr;
     auto out_var = ctx.OutputVar("Out");
     std::vector<int64_t> new_shape;
     auto list_new_shape_tensor =
-        ctx.MultiInput<framework::Tensor>("ShapeTensorList");
+        ctx.MultiInput<phi::DenseTensor>("ShapeTensorList");
     if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) {
       if (ctx.HasInput("ShapeTensor")) {
-        auto *shape_tensor = ctx.Input<framework::Tensor>("ShapeTensor");
+        auto *shape_tensor = ctx.Input<phi::DenseTensor>("ShapeTensor");
         new_shape = GetNewDataFromShapeTensor(shape_tensor);
       } else if (list_new_shape_tensor.size() > 0) {
         new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor);
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 2ceb8a68d863d..7065067ddd91a 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -20,14 +20,14 @@ template <typename T>
 class GPUUniformRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    framework::Tensor* tensor = nullptr;
+    phi::DenseTensor* tensor = nullptr;
     auto out_var = context.OutputVar("Out");
     std::vector<int64_t> new_shape;
     auto
list_new_shape_tensor = - context.MultiInput("ShapeTensorList"); + context.MultiInput("ShapeTensorList"); if (list_new_shape_tensor.size() > 0 || context.HasInput("ShapeTensor")) { if (context.HasInput("ShapeTensor")) { - auto* shape_tensor = context.Input("ShapeTensor"); + auto* shape_tensor = context.Input("ShapeTensor"); new_shape = GetNewDataFromShapeTensor(shape_tensor); } else if (list_new_shape_tensor.size() > 0) { new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 9f0f93f5573f5..bf2666deda28b 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -30,14 +30,14 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; inline std::vector GetNewDataFromShapeTensor( - const Tensor* new_data_tensor) { + const phi::DenseTensor* new_data_tensor) { if (framework::TransToProtoVarType(new_data_tensor->dtype()) == framework::proto::VarType::INT64) { auto* new_data = new_data_tensor->data(); - framework::Tensor cpu_starts_tensor; + phi::DenseTensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { paddle::framework::TensorCopySync( *new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); @@ -50,7 +50,7 @@ inline std::vector GetNewDataFromShapeTensor( framework::proto::VarType::INT32) { auto* new_data = new_data_tensor->data(); std::vector vec_new_data; - framework::Tensor cpu_starts_tensor; + phi::DenseTensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { paddle::framework::TensorCopySync( *new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); @@ -69,7 +69,7 @@ inline std::vector GetNewDataFromShapeTensor( } inline std::vector GetNewDataFromShapeTensorList( - const std::vector& list_new_shape_tensor) { + const std::vector& list_new_shape_tensor) { std::vector vec_new_shape; vec_new_shape.reserve(list_new_shape_tensor.size()); for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { @@ -85,7 +85,7 @@ inline std::vector GetNewDataFromShapeTensorList( if (framework::TransToProtoVarType(tensor->dtype()) == framework::proto::VarType::INT32) { if (platform::is_gpu_place(tensor->place())) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); } else { @@ -94,7 +94,7 @@ inline std::vector GetNewDataFromShapeTensorList( } else if (framework::TransToProtoVarType(tensor->dtype()) == framework::proto::VarType::INT64) { if (platform::is_gpu_place(tensor->place())) { - framework::Tensor temp; + phi::DenseTensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(*temp.data()); } else { @@ -148,7 +148,7 @@ struct UniformGenerator { template void UniformRandom(const framework::ExecutionContext& context, - framework::Tensor* tensor) { + phi::DenseTensor* tensor) { int64_t size = tensor->numel(); auto& dev_cxt = context.template device_context(); T* data = tensor->mutable_data(dev_cxt.GetPlace()); diff --git a/paddle/fluid/operators/uniform_random_op_mlu.cc b/paddle/fluid/operators/uniform_random_op_mlu.cc index 644fdad7fdc75..c37cb5dd2f31c 100644 --- a/paddle/fluid/operators/uniform_random_op_mlu.cc +++ b/paddle/fluid/operators/uniform_random_op_mlu.cc @@ -23,15 +23,15 @@ template class MLUUniformRandomKernel : public framework::OpKernel { public: 
void Compute(const framework::ExecutionContext &ctx) const override { - framework::Tensor *tensor = nullptr; + phi::DenseTensor *tensor = nullptr; auto out_var = ctx.OutputVar("Out"); std::vector new_shape; auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensorList"); + ctx.MultiInput("ShapeTensorList"); if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) { if (ctx.HasInput("ShapeTensor")) { - auto *shape_tensor = ctx.Input("ShapeTensor"); + auto *shape_tensor = ctx.Input("ShapeTensor"); new_shape = GetNewDataFromShapeTensor(shape_tensor); } else if (list_new_shape_tensor.size() > 0) { new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc index a16f8d25de3e7..81b84f5909d8e 100644 --- a/paddle/fluid/operators/uniform_random_op_npu.cc +++ b/paddle/fluid/operators/uniform_random_op_npu.cc @@ -26,14 +26,14 @@ template class NPUUniformRandomKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - framework::Tensor *tensor = nullptr; + phi::DenseTensor *tensor = nullptr; auto out_var = ctx.OutputVar("Out"); std::vector new_shape; auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensorList"); + ctx.MultiInput("ShapeTensorList"); if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) { if (ctx.HasInput("ShapeTensor")) { - auto *shape_tensor = ctx.Input("ShapeTensor"); + auto *shape_tensor = ctx.Input("ShapeTensor"); new_shape = GetNewDataFromShapeTensor(shape_tensor); } else if (list_new_shape_tensor.size() > 0) { new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 6bcb4d2c609f7..45b1e3c435bdc 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -31,15 +31,15 @@ namespace operators { template struct UniqueOpFunctor { - framework::Tensor* out_; - framework::Tensor* index_; - const framework::Tensor* in_; - framework::Tensor* count_; - - UniqueOpFunctor(framework::Tensor* out, - framework::Tensor* index, - const framework::Tensor* in, - framework::Tensor* count = nullptr) + phi::DenseTensor* out_; + phi::DenseTensor* index_; + const phi::DenseTensor* in_; + phi::DenseTensor* count_; + + UniqueOpFunctor(phi::DenseTensor* out, + phi::DenseTensor* index, + const phi::DenseTensor* in, + phi::DenseTensor* count = nullptr) : out_(out), index_(index), in_(in), count_(count) {} template @@ -113,9 +113,9 @@ struct UniqueOpFunctor { } }; -static std::vector Unbind(const framework::Tensor& in) { +static std::vector Unbind(const phi::DenseTensor& in) { int64_t size = in.dims()[0]; - std::vector tensors(size); + std::vector tensors(size); for (int64_t i = 0; i < size; ++i) { tensors[i] = in.Slice(i, i + 1); } @@ -123,7 +123,7 @@ static std::vector Unbind(const framework::Tensor& in) { } template -static bool Equal(const framework::Tensor& a, const framework::Tensor& b) { +static bool Equal(const phi::DenseTensor& a, const phi::DenseTensor& b) { if (a.numel() != b.numel()) { return false; } @@ -137,8 +137,8 @@ static bool Equal(const framework::Tensor& a, const framework::Tensor& b) { template static void UniqueFlattendTensor(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, bool return_index, bool return_inverse, bool return_counts) { @@ -149,7 
+149,7 @@ static void UniqueFlattendTensor(const framework::ExecutionContext& context, std::copy(unique.begin(), unique.end(), out_data); if (return_index) { - auto* indices = context.Output("Indices"); + auto* indices = context.Output("Indices"); indices->Resize(phi::make_ddim({out->numel()})); auto indices_data = indices->mutable_data(context.GetPlace()); std::unordered_map indices_map; @@ -164,7 +164,7 @@ static void UniqueFlattendTensor(const framework::ExecutionContext& context, } if (return_inverse) { - auto* inverse = context.Output("Index"); + auto* inverse = context.Output("Index"); inverse->Resize(phi::make_ddim({in.numel()})); auto inverse_data = inverse->mutable_data(context.GetPlace()); std::unordered_map inverse_map; @@ -178,7 +178,7 @@ static void UniqueFlattendTensor(const framework::ExecutionContext& context, } if (return_counts) { - auto* count = context.Output("Counts"); + auto* count = context.Output("Counts"); count->Resize(phi::make_ddim({out->numel()})); auto count_data = count->mutable_data(context.GetPlace()); std::unordered_map counts_map; @@ -232,8 +232,8 @@ static ForwardIt UniqueDimImpl(const framework::ExecutionContext& context, template static void UniqueDim(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, bool return_index, bool return_inverse, bool return_counts, @@ -246,7 +246,7 @@ static void UniqueDim(const framework::ExecutionContext& context, std::vector in_trans_dims_vec(phi::vectorize(in.dims())); in_trans_dims_vec[axis] = in.dims()[0]; in_trans_dims_vec[0] = in.dims()[axis]; - framework::Tensor in_trans; + phi::DenseTensor in_trans; framework::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); in_trans.Resize(in_trans_dims); in_trans.mutable_data(context.GetPlace()); @@ -278,7 +278,7 @@ static void UniqueDim(const framework::ExecutionContext& context, }); // sort tensor according to indices - framework::Tensor input_sorted; + phi::DenseTensor input_sorted; input_sorted.Resize(in_trans_dims); input_sorted.mutable_data(context.GetPlace()); InT* input_sorted_data = input_sorted.data(); @@ -288,11 +288,11 @@ static void UniqueDim(const framework::ExecutionContext& context, col * sizeof(InT)); } - std::vector input_unbind = Unbind(input_sorted); + std::vector input_unbind = Unbind(input_sorted); std::vector inverse_vec(sorted_indices_vec.size(), 0); std::vector counts_vec(sorted_indices_vec.size(), 0); std::vector indices_vec(sorted_indices_vec.size(), 0); - auto last = UniqueDimImpl::iterator, InT>( + auto last = UniqueDimImpl::iterator, InT>( context, input_unbind.begin(), input_unbind.end(), @@ -306,7 +306,7 @@ static void UniqueDim(const framework::ExecutionContext& context, indices_vec.end()); math::ConcatFunctor concat_functor; - framework::Tensor out_trans; + phi::DenseTensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = input_unbind.size(); out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); @@ -319,17 +319,17 @@ static void UniqueDim(const framework::ExecutionContext& context, out_trans.dims().size(), dev_ctx, out_trans, out, permute); if (return_inverse) { - auto* inverse = context.Output("Index"); + auto* inverse = context.Output("Index"); framework::TensorFromVector(inverse_vec, context.device_context(), inverse); } if (return_counts) { - auto* count = context.Output("Counts"); + auto* count = context.Output("Counts"); framework::TensorFromVector(counts_vec, context.device_context(), 
count); } if (return_index) { - auto* indices = context.Output("Indices"); + auto* indices = context.Output("Indices"); framework::TensorFromVector(indices_vec, context.device_context(), indices); } } @@ -337,15 +337,15 @@ static void UniqueDim(const framework::ExecutionContext& context, template struct UniqueFlattendTensorFunctor { const framework::ExecutionContext& ctx_; - const framework::Tensor& in_; - framework::Tensor* out_; + const phi::DenseTensor& in_; + phi::DenseTensor* out_; const bool return_index_; const bool return_inverse_; const bool return_counts_; UniqueFlattendTensorFunctor(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, bool return_index, bool return_inverse, bool return_counts) @@ -366,16 +366,16 @@ struct UniqueFlattendTensorFunctor { template struct UniqueDimFunctor { const framework::ExecutionContext& ctx_; - const framework::Tensor& in_; - framework::Tensor* out_; + const phi::DenseTensor& in_; + phi::DenseTensor* out_; const int axis_; const bool return_index_; const bool return_inverse_; const bool return_counts_; UniqueDimFunctor(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, const int axis, bool return_index, bool return_inverse, @@ -399,8 +399,8 @@ template class UniqueKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); auto data_type = static_cast( context.Attr("dtype")); if (data_type == framework::proto::VarType::INT32) { @@ -414,7 +414,7 @@ class UniqueKernel : public framework::OpKernel { x->numel())); } if (!context.Attr("is_sorted")) { - auto* index = context.Output("Index"); + auto* index = context.Output("Index"); framework::VisitDataType(data_type, UniqueOpFunctor(out, index, x)); return; diff --git a/paddle/fluid/operators/unique_with_counts_op.h b/paddle/fluid/operators/unique_with_counts_op.h index 227fdef222432..eb3cc2d4731df 100644 --- a/paddle/fluid/operators/unique_with_counts_op.h +++ b/paddle/fluid/operators/unique_with_counts_op.h @@ -31,10 +31,10 @@ class UniqueWithCountsKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto data_type = static_cast( context.Attr("dtype")); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto* index = context.Output("Index"); - auto* count = context.Output("Count"); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto* index = context.Output("Index"); + auto* count = context.Output("Count"); framework::VisitDataType(data_type, UniqueOpFunctor(out, index, x, count)); } diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index f01ae5f142d28..2a4da567e5871 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -154,7 +154,7 @@ class UnsqueezeOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "AxesTensor" || var_name == "AxesTensorList") { return expected_kernel_type; diff --git 
a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h index 774a8d553fd51..a082918c83dcc 100644 --- a/paddle/fluid/operators/unsqueeze_op.h +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -37,11 +37,11 @@ class UnsqueezeKernel : public framework::OpKernel { bool need_resize_out_dims = false; if (axes.empty()) { auto axes_tensor_list = - context.MultiInput("AxesTensorList"); + context.MultiInput("AxesTensorList"); if (axes_tensor_list.size() > 0) { axes = GetDataFromTensorList(axes_tensor_list); } else if (context.HasInput("AxesTensor")) { - auto *axes_tensor = context.Input("AxesTensor"); + auto *axes_tensor = context.Input("AxesTensor"); axes = GetDataFromTensor(axes_tensor); } need_resize_out_dims = true; diff --git a/paddle/fluid/operators/unstack_op_mlu.cc b/paddle/fluid/operators/unstack_op_mlu.cc index 1819e37df597f..55171364377e0 100644 --- a/paddle/fluid/operators/unstack_op_mlu.cc +++ b/paddle/fluid/operators/unstack_op_mlu.cc @@ -22,8 +22,8 @@ template class UnStackMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto out = ctx.MultiOutput("Y"); + auto *x = ctx.Input("X"); + auto out = ctx.MultiOutput("Y"); int axis = ctx.Attr("axis"); if (axis < 0) axis += x->dims().size(); int num = x->dims()[axis]; @@ -56,8 +56,8 @@ template class UnStackGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput(framework::GradVarName("Y")); - auto *y = ctx.Output(framework::GradVarName("X")); + auto x = ctx.MultiInput(framework::GradVarName("Y")); + auto *y = ctx.Output(framework::GradVarName("X")); int axis = ctx.Attr("axis"); if (axis < 0) axis += (x[0]->dims().size() + 1); int num = static_cast(x.size()); diff --git a/paddle/fluid/operators/unstack_op_npu.cc b/paddle/fluid/operators/unstack_op_npu.cc index 0c3d40279b01b..18b7de754c0ed 100644 --- a/paddle/fluid/operators/unstack_op_npu.cc +++ b/paddle/fluid/operators/unstack_op_npu.cc @@ -22,8 +22,8 @@ template class UnStackNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *dy = ctx.Input("X"); - auto dx = ctx.MultiOutput("Y"); + auto *dy = ctx.Input("X"); + auto dx = ctx.MultiOutput("Y"); int axis = ctx.Attr("axis"); if (axis < 0) axis += dy->dims().size(); int num = dy->dims()[axis]; @@ -32,7 +32,7 @@ class UnStackNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - std::vector dx_list; + std::vector dx_list; for (int i = 0; i < num; i++) { dx[i]->mutable_data(ctx.GetPlace()); dx_list.push_back(*dx[i]); @@ -48,8 +48,8 @@ template class UnStackGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput(framework::GradVarName("Y")); - auto *y = ctx.Output(framework::GradVarName("X")); + auto x = ctx.MultiInput(framework::GradVarName("Y")); + auto *y = ctx.Output(framework::GradVarName("X")); int axis = ctx.Attr("axis"); if (axis < 0) axis += (x[0]->dims().size() + 1); int num = static_cast(x.size()); @@ -58,7 +58,7 @@ class UnStackGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - std::vector x_list; + std::vector x_list; for (int i = 0; i < num; i++) { x_list.push_back(*x[i]); } diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index 7315f3a287ab5..adce638e9dd40 100644 --- 
a/paddle/fluid/operators/utils.h
+++ b/paddle/fluid/operators/utils.h
@@ -22,12 +22,12 @@ namespace paddle {
 namespace operators {

 template <typename T>
-inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) {
+inline std::vector<T> GetDataFromTensor(const phi::DenseTensor* x) {
   std::vector<T> vec_new_data;
   if (framework::TransToProtoVarType(x->dtype()) ==
       framework::proto::VarType::INT32) {
     auto* data = x->data<int>();
-    framework::Tensor cpu_attr_tensor;
+    phi::DenseTensor cpu_attr_tensor;
     if (!platform::is_cpu_place(x->place())) {
       paddle::framework::TensorCopySync(
           *x, platform::CPUPlace(), &cpu_attr_tensor);
@@ -37,7 +37,7 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) {
   } else if (framework::TransToProtoVarType(x->dtype()) ==
              framework::proto::VarType::INT64) {
     auto* data = x->data<int64_t>();
-    framework::Tensor cpu_attr_tensor;
+    phi::DenseTensor cpu_attr_tensor;
     if (!platform::is_cpu_place(x->place())) {
       paddle::framework::TensorCopySync(
           *x, platform::CPUPlace(), &cpu_attr_tensor);
@@ -55,7 +55,7 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) {

 template <typename T>
 inline std::vector<T> GetDataFromTensorList(
-    const std::vector<const framework::Tensor*>& list_tensor) {
+    const std::vector<const phi::DenseTensor*>& list_tensor) {
   std::vector<T> vec_new_data;
   for (size_t i = 0; i < list_tensor.size(); ++i) {
     auto tensor = list_tensor[i];
@@ -70,7 +70,7 @@ inline std::vector<T> GetDataFromTensorList(
     if (framework::TransToProtoVarType(tensor->dtype()) ==
         framework::proto::VarType::INT32) {
       if (!platform::is_cpu_place(tensor->place())) {
-        framework::Tensor temp;
+        phi::DenseTensor temp;
         paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
         vec_new_data.push_back(static_cast<T>(*temp.data<int>()));
       } else {
@@ -79,7 +79,7 @@ inline std::vector<T> GetDataFromTensorList(
     } else if (framework::TransToProtoVarType(tensor->dtype()) ==
                framework::proto::VarType::INT64) {
       if (!platform::is_cpu_place(tensor->place())) {
-        framework::Tensor temp;
+        phi::DenseTensor temp;
         paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
         // NOTE: Converting int64 to int32 may cause data overflow.
         vec_new_data.push_back(static_cast<T>(*temp.data<int64_t>()));
@@ -105,7 +105,7 @@ inline framework::DDim GetShape(const framework::ExecutionContext& ctx) {
   }

   // 2. shape is a list/tuple containing Tensor
-  auto shape_tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
+  auto shape_tensor_list = ctx.MultiInput<phi::DenseTensor>("ShapeTensorList");
   if (shape_tensor_list.size() > 0) {
     auto vec_shape = GetDataFromTensorList<int64_t>(shape_tensor_list);
     return phi::make_ddim(vec_shape);
@@ -117,10 +117,10 @@ inline framework::DDim GetShape(const framework::ExecutionContext& ctx) {
 }

 template <typename T>
-inline T GetValue(const framework::Tensor* x) {
+inline T GetValue(const phi::DenseTensor* x) {
   T value = static_cast<T>(0);
   if (!platform::is_cpu_place(x->place())) {
-    framework::Tensor cpu_x;
+    phi::DenseTensor cpu_x;
     framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
 #if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc
index eb7421019bd81..9ae05dd65a309 100644
--- a/paddle/fluid/operators/var_conv_2d_op.cc
+++ b/paddle/fluid/operators/var_conv_2d_op.cc
@@ -24,7 +24,7 @@ limitations under the License.
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using LoD = framework::LoD; @@ -270,7 +270,7 @@ class CPUVarConv2dOPKernel : public framework::OpKernel { auto* bottom = ctx.Input("X"); auto* in_row = ctx.Input("ROW"); auto* in_col = ctx.Input("COLUMN"); - auto* w = ctx.Input("W"); + auto* w = ctx.Input("W"); auto* top = ctx.Output("Out"); auto* col = ctx.Output("Col"); @@ -451,7 +451,7 @@ class CPUVarConv2dOPGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); - auto* w = ctx.Input("W"); + auto* w = ctx.Input("W"); auto* col = ctx.Input("Col"); auto* out = ctx.Input("Out"); @@ -462,7 +462,7 @@ class CPUVarConv2dOPGradKernel : public framework::OpKernel { auto* d_out = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - auto* d_w = ctx.Output(framework::GradVarName("W")); + auto* d_w = ctx.Output(framework::GradVarName("W")); Tensor col_grad; col_grad.Resize(col->dims()); diff --git a/paddle/fluid/operators/var_conv_2d_op.h b/paddle/fluid/operators/var_conv_2d_op.h index b8d5de060934f..bb7bd25284a2b 100644 --- a/paddle/fluid/operators/var_conv_2d_op.h +++ b/paddle/fluid/operators/var_conv_2d_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = framework::LoDTensor; using LoD = framework::LoD; diff --git a/paddle/fluid/operators/where_index_op_mlu.cc b/paddle/fluid/operators/where_index_op_mlu.cc index 389f7960bcdc1..85f463f723ef5 100644 --- a/paddle/fluid/operators/where_index_op_mlu.cc +++ b/paddle/fluid/operators/where_index_op_mlu.cc @@ -20,14 +20,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class MLUWhereIndexKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* out = context.Output("Out"); + auto* condition = context.Input("Condition"); + auto* out = context.Output("Out"); auto dims = condition->dims(); const int rank = dims.size(); @@ -54,7 +54,7 @@ class MLUWhereIndexKernel : public framework::OpKernel { } auto& dev_ctx = context.template device_context(); - framework::Tensor out_int32 = + phi::DenseTensor out_int32 = context.AllocateTmpTensor(out->dims(), dev_ctx); MLUCnnlTensorDesc out_int32_desc(out_int32); diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc index cadb76d53f981..5b006cbdcf1b0 100644 --- a/paddle/fluid/operators/where_index_op_npu.cc +++ b/paddle/fluid/operators/where_index_op_npu.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; template class NPUWhereIndexKernel : public framework::OpKernel { @@ -29,8 +29,8 @@ class NPUWhereIndexKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto& dev_ctx = context.template device_context(); - auto* condition = context.Input("Condition"); - auto* out = context.Output("Out"); + auto* condition = context.Input("Condition"); + auto* out = context.Output("Out"); auto dims = condition->dims(); const int rank = dims.size(); diff --git a/paddle/fluid/operators/where_op_mlu.cc b/paddle/fluid/operators/where_op_mlu.cc index 57c20ed14f1aa..53ae38bb48b27 100644 --- a/paddle/fluid/operators/where_op_mlu.cc +++ b/paddle/fluid/operators/where_op_mlu.cc @@ -24,10 +24,10 @@ template class WhereMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* out = context.Output("Out"); + auto* condition = context.Input("Condition"); + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* out = context.Output("Out"); auto place = context.GetPlace(); out->mutable_data(place); MLUCnnlTensorDesc x_desc(*X); diff --git a/paddle/fluid/operators/where_op_npu.cc b/paddle/fluid/operators/where_op_npu.cc index 68a5aef6f3097..bd30931580141 100644 --- a/paddle/fluid/operators/where_op_npu.cc +++ b/paddle/fluid/operators/where_op_npu.cc @@ -22,10 +22,10 @@ template class WhereNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* condition = ctx.Input("Condition"); - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* condition = ctx.Input("Condition"); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); const auto& runner = @@ -42,10 +42,10 @@ template class WhereGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* condition = ctx.Input("Condition"); - auto* dout_t = ctx.Input(framework::GradVarName("Out")); - auto* dx_t = ctx.Output(framework::GradVarName("X")); - auto* dy_t = ctx.Output(framework::GradVarName("Y")); + auto* condition = ctx.Input("Condition"); + auto* dout_t = ctx.Input(framework::GradVarName("Out")); + auto* dx_t = ctx.Output(framework::GradVarName("X")); + auto* dy_t = ctx.Output(framework::GradVarName("Y")); if (dx_t != nullptr) { dx_t->mutable_data(ctx.GetPlace()); @@ -58,7 +58,7 @@ class WhereGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - framework::Tensor tensor_zeros(dout_t->dtype()); + phi::DenseTensor tensor_zeros(dout_t->dtype()); tensor_zeros.mutable_data(dout_t->dims(), ctx.GetPlace()); const auto& runner = NpuOpRunner("ZerosLike", {*dout_t}, {tensor_zeros}, {}); diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h index d2c96bd616861..4c949c66d1bc2 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h @@ -33,7 +33,6 @@ class DenseTensor; namespace paddle { namespace platform { -using framework::Tensor; template inline cudnnDataType_t ToCudnnDataType(const T& t) { @@ -141,7 +140,7 @@ 
class TensorDescriptor { } T* desc() { return desc_.get(); } T* desc() const { return desc_.get(); } - void set(const Tensor& tensor, const int groups = 1) { + void set(const phi::DenseTensor& tensor, const int groups = 1) { auto dims = phi::vectorize(tensor.dims()); std::vector strides(dims.size()); strides[dims.size() - 1] = 1; @@ -177,7 +176,7 @@ class TensorDescriptor { transformed_dims.data())); } - void set(const Tensor& tensor, const cudnnTensorFormat_t format) { + void set(const phi::DenseTensor& tensor, const cudnnTensorFormat_t format) { auto dims = phi::vectorize(tensor.dims()); auto dtype = ToCudnnDataType(framework::TransToProtoVarType(tensor.dtype())); @@ -228,7 +227,7 @@ class FilterDescriptor { transformed_dims.data())); } - void set(const Tensor& tensor, + void set(const phi::DenseTensor& tensor, const cudnnTensorFormat_t format, const int groups = 1) { auto dims = phi::vectorize(tensor.dims()); diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h index 427901c1a7fd5..2f63ee880b13e 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h @@ -341,7 +341,7 @@ class ScopedDropoutDescriptor { const platform::Place& place, bool initialized, float dropout_prob_, - framework::Tensor* dropout_state_, + phi::DenseTensor* dropout_state_, int seed, size_t state_size) { if (dropout_state_ == nullptr) { // for no dropout or test diff --git a/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc index 2e58e71cc2c06..cbe322ef0c48c 100644 --- a/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc +++ b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc @@ -29,7 +29,7 @@ TEST(TensorDescriptor, Empty) { } TEST(TensorDescriptor, Normal) { - framework::Tensor tt; + phi::DenseTensor tt; tt.Resize({2, 3, 4}); tt.mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h index 6f943ea352696..158693f5dad70 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h @@ -32,7 +32,6 @@ class DenseTensor; namespace paddle { namespace platform { -using framework::Tensor; template inline miopenDataType_t ToCudnnDataType(const T& t) { diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index 9cb5cdfbb164d..7a77a47189d11 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -285,7 +285,7 @@ class ScopedDropoutDescriptor { const platform::Place& place, bool initialized, float dropout_prob_, - framework::Tensor* dropout_state_, + phi::DenseTensor* dropout_state_, int seed, size_t state_size) { if (dropout_state_ == nullptr) { // for no dropout or test diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 9e960a99123c0..30ee14c44893a 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -61,8 +61,8 @@ void IpuBackend::Compile(framework::ir::Graph* graph, VLOG(10) << "leave IpuBackend::Compile"; } -void IpuBackend::Run(const std::vector& inputs, - const std::vector& outputs, +void IpuBackend::Run(const std::vector& inputs, + const std::vector& outputs, const framework::ExecutionContext& ctx) { 
timer_->Start(); executor_->Run(inputs, outputs, ctx); diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index 1e083e7a3518c..1f15f3832db7f 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -54,8 +54,8 @@ class IpuBackend { const std::vector &fetch_list); // Run the compiled graph on ipu - void Run(const std::vector &inputs, - const std::vector &outputs, + void Run(const std::vector &inputs, + const std::vector &outputs, const framework::ExecutionContext &ctx); // Sync weights from IPU while training diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.h b/paddle/fluid/platform/device/ipu/ipu_utils.h index 66094a0a4b0ab..9e075d3c06c90 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.h +++ b/paddle/fluid/platform/device/ipu/ipu_utils.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" using float16 = paddle::platform::float16; -using Tensor = paddle::framework::Tensor; +using Tensor = phi::DenseTensor; using LoDTensor = paddle::framework::LoDTensor; using Scope = paddle::framework::Scope; using OpDesc = paddle::framework::OpDesc; diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index 220dd23c3f14c..bdd25dd462706 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -28,7 +28,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using Tensor = phi::DenseTensor; using DataLayout = framework::DataLayout; using NPUAttribute = framework::NPUAttribute; using NPUAttributeMap = framework::NPUAttributeMap; diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index cb2649686ec02..c4ac44603dd2f 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -56,9 +56,9 @@ TEST(DeviceCode, cuda) { paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code); - paddle::framework::Tensor cpu_x; - paddle::framework::Tensor cpu_y; - paddle::framework::Tensor cpu_z; + phi::DenseTensor cpu_x; + phi::DenseTensor cpu_y; + phi::DenseTensor cpu_z; float scale = 2; auto dims = @@ -74,9 +74,9 @@ TEST(DeviceCode, cuda) { cpu_y.data()[i] = static_cast(0.5); } - paddle::framework::Tensor x; - paddle::framework::Tensor y; - paddle::framework::Tensor z; + phi::DenseTensor x; + phi::DenseTensor y; + phi::DenseTensor z; float* x_data = x.mutable_data(dims, place); float* y_data = y.mutable_data(dims, place); diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 0e97a68edfc9d..07f5f3408a30c 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include #include -#include "dnnl.hpp" +#include "dnnl.hpp" // NOLINT #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler/event_tracing.h" @@ -76,7 +76,7 @@ tf_pd MKLDNNBwdPrimitiveDesc(const Engine& e, return tf_pd(desc, e, p); } -inline void MatchShapeToLayout(framework::Tensor* tensor_in, +inline void MatchShapeToLayout(phi::DenseTensor* tensor_in, framework::DataLayout from, framework::DataLayout to) { auto print_dims = [](const std::vector& dims) { @@ -577,7 +577,7 @@ inline void GetGroupConvWeightsTz(std::vector& weights_tz, // NOLINT } inline void RegisterModelLayout( - std::vector>& ops, + std::vector>& ops, // NOLINT const platform::Place& place) { if (platform::is_cpu_place(place)) { // If there is already registered NHWC then quit this call diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index ca099cb65d67c..604ddb9555ea4 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -31,7 +31,7 @@ namespace paddle { namespace platform { using framework::DataLayout; -using framework::Tensor; + using user_function = std::function(const float*)>; using memory = dnnl::memory; @@ -236,7 +236,7 @@ class MatMulV2MKLDNNHandler } if (ctx.HasInput("ResidualData")) { - auto* residual_data = ctx.Input("ResidualData"); + auto* residual_data = ctx.Input("ResidualData"); auto residual_data_tz = phi::vectorize(residual_data->dims()); auto residual_data_md = memory::desc(residual_data_tz, MKLDNNGetDataType(), @@ -273,22 +273,20 @@ class MatMulV2MKLDNNHandler return fake_strides; } - std::shared_ptr AcquireWeightsMemory(const Tensor* input) { + std::shared_ptr AcquireWeightsMemory(const phi::DenseTensor* input) { const YT* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), to_void_cast(input_data)); } - std::shared_ptr AcquireDstMemory( - paddle::framework::Tensor* output) { + std::shared_ptr AcquireDstMemory(phi::DenseTensor* output) { // We cannot use base AcquireDstMemory as it makes an allocation request // base on DST memory primitive size. This is fine in general, but in MatMul // we have primitive that covers only one batch of Data and then shift - // pointer for every new batch. Hence Tensor size is bigger that dst memory - // primitive size. So would we request less memory that is there and it - // triggers an - // assertion. So as there is no 'any' format here we can leave default size - // of Tensor as computed in ComputeInferShape + // pointer for every new batch. Hence phi::DenseTensor size is bigger that + // dst memory primitive size. So would we request less memory that is there + // and it triggers an assertion. So as there is no 'any' format here we can + // leave default size of phi::DenseTensor as computed in ComputeInferShape OT* ptr = output->mutable_data(this->place_); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } @@ -304,7 +302,7 @@ class ActivationMKLDNNHandler const framework::ExecutionContext& ctx, const dnnl::engine engine, Place cpu_place, - const framework::Tensor* x) + const phi::DenseTensor* x) : platform::MKLDNNHandlerNoCachingT(engine, @@ -314,7 +312,7 @@ class ActivationMKLDNNHandler if (ctx.Type() == "scale") { bool bias_after_scale = ctx.Attr("bias_after_scale"); - auto* scale_tensor = ctx.Input("ScaleTensor"); + auto* scale_tensor = ctx.Input("ScaleTensor"); alpha = (scale_tensor == nullptr) ? 
ctx.Attr("scale") : static_cast(*(scale_tensor->data())); @@ -327,10 +325,12 @@ class ActivationMKLDNNHandler beta *= alpha; } } else if (ctx.Type() == "clip") { - alpha = ctx.HasInput("Min") ? ctx.Input("Min")->data()[0] - : ctx.Attr("min"); - beta = ctx.HasInput("Max") ? ctx.Input("Max")->data()[0] - : ctx.Attr("max"); + alpha = ctx.HasInput("Min") + ? ctx.Input("Min")->data()[0] + : ctx.Attr("min"); + beta = ctx.HasInput("Max") + ? ctx.Input("Max")->data()[0] + : ctx.Attr("max"); } else { // paddle uses beta but mkldnn uses alpha for swish if (algorithm == dnnl::algorithm::eltwise_swish) { @@ -351,8 +351,8 @@ class ActivationMKLDNNHandler const framework::ExecutionContext& ctx, const dnnl::engine engine, Place cpu_place, - const framework::Tensor* x, - const Tensor* dout) + const phi::DenseTensor* x, + const phi::DenseTensor* dout) : platform::MKLDNNHandlerNoCachingT(engine, @@ -368,10 +368,12 @@ class ActivationMKLDNNHandler } if (ctx.Type() == "clip_grad") { - alpha = ctx.HasInput("Min") ? ctx.Input("Min")->data()[0] - : ctx.Attr("min"); - beta = ctx.HasInput("Max") ? ctx.Input("Max")->data()[0] - : ctx.Attr("max"); + alpha = ctx.HasInput("Min") + ? ctx.Input("Min")->data()[0] + : ctx.Attr("min"); + beta = ctx.HasInput("Max") + ? ctx.Input("Max")->data()[0] + : ctx.Attr("max"); } this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, @@ -384,7 +386,7 @@ class ActivationMKLDNNHandler } std::shared_ptr AcquireBackwardSrcMemory( - const framework::Tensor* input) { + const phi::DenseTensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), to_void_cast(input_data)); @@ -474,7 +476,7 @@ class ReorderMKLDNNHandler { return sub_mem_p; } - std::shared_ptr AcquireDstMemory(framework::Tensor* output, + std::shared_ptr AcquireDstMemory(phi::DenseTensor* output, const MKLDNNMemoryFormat& fmt, platform::Place place) { auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); @@ -484,7 +486,7 @@ class ReorderMKLDNNHandler { } std::shared_ptr AcquireDstMemory( - framework::Tensor* output, + phi::DenseTensor* output, const dnnl::memory::desc& src_md, platform::Place place) { if (vtype_dst_ == vtype_) { @@ -501,7 +503,7 @@ class ReorderMKLDNNHandler { } std::shared_ptr AcquireDstMemory( - framework::Tensor* output, + phi::DenseTensor* output, const std::vector& dims, const MKLDNNMemoryFormat& fmt, platform::Place place) { diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 03aace9b78e38..c616d3fbebf11 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -206,7 +206,7 @@ void InitTensorWithTensor(TensorObject* self, } void InitTensorWithFrameworkTensor(TensorObject* self, - const framework::Tensor& src, + const phi::DenseTensor& src, const paddle::platform::Place& place, const std::string& name) { self->tensor.set_name(name); @@ -382,7 +382,7 @@ void AutoInitTensorByPyArray(TensorObject* py_tensor_ptr, InitTensorWithNumpyValue(py_tensor_ptr, numpy_value, place, zero_copy); } -// initialize Tensor by Tensor or framework::Tensor (mix args and +// initialize Tensor by Tensor or phi::DenseTensor (mix args and // kwargs) automatically. 
void AutoInitTensorByTensor(TensorObject* py_tensor_ptr, std::unordered_map kws_map, @@ -428,7 +428,7 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr, InitTensorWithTensor(py_tensor_ptr, src_tensor, place, act_name); } else { // init by framework tensor - framework::Tensor src_tensor; + phi::DenseTensor src_tensor; if (kw_order_map["value"] <= args_num) { src_tensor = CastPyArg2FrameworkTensor( PyTuple_GET_ITEM(args, kw_order_map["value"] - 1), @@ -438,8 +438,8 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr, src_tensor = CastPyArg2FrameworkTensor(kws_map["value"], 0); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The first expected arguments is {value: framework::Tensor}, " - "but could not parse the first argument {value: framework::Tensor} " + "The first expected arguments is {value: phi::DenseTensor}, " + "but could not parse the first argument {value: phi::DenseTensor} " "successfully. " "Please check your input first and make sure you are on the right " "way.")); @@ -687,7 +687,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { PADDLE_THROW(platform::errors::InvalidArgument( "Could not parse the first keyword argument successfully, " "the first keyword argument is value, but it should be PyArray " - "or Tensor or framework::Tensor. " + "or Tensor or phi::DenseTensor. " "Please check your input first and make sure you are on the " "right way.")); } @@ -753,7 +753,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "We not only support construct Tensor from numpy value " - "or tensor(Tensor or framework::Tensor) " + "or tensor(Tensor or phi::DenseTensor) " "with python kwargs by this initializer, " "but also even support dtype to init a empty Tensor. " "Please check your input first and make sure you call the existed " @@ -789,10 +789,10 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "We support construct Tensor from numpy value " - "or tensor(Tensor or framework::Tensor) " + "or tensor(Tensor or phi::DenseTensor) " "with python args and kwargs by this initializer, " "but the first argument should be PyArray or Tensor or " - "framework::Tensor. " + "phi::DenseTensor. " "Please check your input first and make sure you call the existed " "constructor.")); } diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 956d8e5814cc0..b2a59140d695b 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -192,7 +192,7 @@ static PyObject* eager_api_read_next_tensor_list(PyObject* self, { eager_gil_scoped_release guard; tensor_list.reserve(tensor_base_list.size()); - auto func = [](framework::Tensor& tensor_base) { + auto func = [](phi::DenseTensor& tensor_base) { paddle::experimental::Tensor tensor( egr::Controller::Instance().GenerateUniqueName()); auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 5233bbc832935..999a9e7ce8f4b 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -581,13 +581,11 @@ static PyObject* tensor__share_buffer_to(TensorObject* self, "Tensor %s has not been initialized! 
please initialize " "src tensor before share_buffer_with to other.", self->tensor.name())); - auto* src_tensor = - static_cast(self->tensor.impl().get()); + auto* src_tensor = static_cast(self->tensor.impl().get()); if (!dst_ptr->defined()) { dst_ptr->set_impl(std::make_shared()); } - auto dst_tensor = - static_cast(dst_ptr->impl().get()); + auto dst_tensor = static_cast(dst_ptr->impl().get()); dst_tensor->ShareBufferWith(*src_tensor); dst_tensor->ShareDataTypeWith(*src_tensor); RETURN_PY_NONE @@ -611,10 +609,8 @@ static PyObject* tensor__is_shared_buffer_with(TensorObject* self, if (!self->tensor.defined() || !dst_ptr->defined()) { return ToPyObject(res); } - auto* self_ptr = - static_cast(self->tensor.impl().get()); - auto dst_tensor = - static_cast(dst_ptr->impl().get()); + auto* self_ptr = static_cast(self->tensor.impl().get()); + auto dst_tensor = static_cast(dst_ptr->impl().get()); res = dst_tensor->IsSharedBufferWith(*self_ptr); return ToPyObject(res); EAGER_CATCH_AND_THROW_RETURN_NULL diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index df09dd7ec0a70..944fbb7faaf84 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -428,10 +428,10 @@ platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos) { return place; } -framework::Tensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) { +phi::DenseTensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) { if (PyObject_IsInstance( obj, reinterpret_cast(g_framework_tensor_pytype))) { - return ::pybind11::handle(obj).cast(); + return ::pybind11::handle(obj).cast(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " @@ -441,8 +441,8 @@ framework::Tensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) { } } -std::vector CastPyArg2VectorOfTensorBase(PyObject* obj, - ssize_t arg_pos) { +std::vector CastPyArg2VectorOfTensorBase(PyObject* obj, + ssize_t arg_pos) { std::vector result; if (PyList_Check(obj)) { Py_ssize_t len = PyList_Size(obj); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 1f4a93dab91eb..f0ca654122937 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -64,7 +64,7 @@ std::shared_ptr CastPyArg2VarBase(PyObject* obj, std::vector CastPyArg2VectorOfTensor( PyObject* obj, ssize_t arg_pos); platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos); -framework::Tensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos); +phi::DenseTensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos); std::vector CastPyArg2VectorOfTensorBase(PyObject* obj, ssize_t arg_pos); std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 3dc87f0f7cc04..5e19c4b557c6b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -282,7 +282,7 @@ static void InitVarBaseFromNumpyWithArgDefault(imperative::VarBase *self, } static void InitVarBaseFromTensorWithArgDefault(imperative::VarBase *self, - const framework::Tensor &tensor, + const phi::DenseTensor &tensor, const std::string &name) { VLOG(4) << "Init VarBase"; auto place = imperative::GetCurrentTracer()->ExpectedPlace(); @@ -306,7 +306,7 @@ static void InitVarBaseFromTensorWithArgDefault(imperative::VarBase *self, template static void InitVarBaseFromTensorWithArg(imperative::VarBase *self, - const framework::Tensor &tensor, + const 
phi::DenseTensor &tensor, const P &place, const std::string &name) { VLOG(4) << "Init VarBase"; @@ -3031,9 +3031,9 @@ void BindImperative(py::module *m_ptr) { } // Select the index data to the buffer - auto index_select = [](const framework::Tensor &src_tensor, - const framework::Tensor &index_tensor, - framework::Tensor *buffer_tensor) { + auto index_select = [](const phi::DenseTensor &src_tensor, + const phi::DenseTensor &index_tensor, + phi::DenseTensor *buffer_tensor) { auto *src_data = src_tensor.data(); auto *index_data = index_tensor.data(); auto *buffer_data = diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index eb395ed2a144a..96e5b9f5c6b68 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -236,7 +236,7 @@ paddle_infer::PlaceType ToPaddleInferPlace( } void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT - framework::Tensor input_tensor) { + phi::DenseTensor input_tensor) { std::vector shape; for (int i = 0; i < input_tensor.dims().size(); ++i) { shape.push_back(input_tensor.dims()[i]); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 9408429641a7f..9acacd5a0c7ab 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -679,7 +679,7 @@ PYBIND11_MODULE(libpaddle, m) { PyCapsule_SetName(dltensor->ptr(), "used_dltensor"); DLTensor dl = dmt->dl_tensor; - framework::Tensor tensor; + phi::DenseTensor tensor; if (dl.device.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dl, &tensor); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 8152a11c8193a..addc9b7c27d4c 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -196,8 +196,8 @@ namespace pybind { PyTypeObject *g_framework_tensor_pytype = nullptr; template -static void TensorCopyFrom(framework::Tensor *dst, - const framework::Tensor &src, +static void TensorCopyFrom(phi::DenseTensor *dst, + const phi::DenseTensor &src, const PlaceType &place, int64_t batch_size) { if (batch_size < 0) { @@ -210,134 +210,134 @@ static void TensorCopyFrom(framework::Tensor *dst, void BindTensor(pybind11::module &m) { // NOLINT using namespace paddle::framework; // NOLINT - py::class_ framework_tensor( + py::class_ framework_tensor( m, "Tensor", py::buffer_protocol()); g_framework_tensor_pytype = reinterpret_cast(framework_tensor.ptr()); framework_tensor .def("__array__", - [](framework::Tensor &self) { return TensorToPyArray(self); }) + [](phi::DenseTensor &self) { return TensorToPyArray(self); }) .def("_ptr", - [](const framework::Tensor &self) { + [](const phi::DenseTensor &self) { return reinterpret_cast(self.data()); }) - .def("_slice", &framework::Tensor::Slice) - .def("_numel", &framework::Tensor::numel) + .def("_slice", &phi::DenseTensor::Slice) + .def("_numel", &phi::DenseTensor::numel) .def("_is_initialized", - [](const framework::Tensor &self) { return self.IsInitialized(); }) + [](const phi::DenseTensor &self) { return self.IsInitialized(); }) .def("_get_dims", - [](const framework::Tensor &self) { return vectorize(self.dims()); }) + [](const phi::DenseTensor &self) { return vectorize(self.dims()); }) .def("_set_dims", - [](framework::Tensor &self, const std::vector &dim) { + [](phi::DenseTensor &self, const std::vector &dim) { self.Resize(phi::make_ddim(dim)); }) .def("_set_layout", - [](framework::Tensor &self, const std::string &layout) { + [](phi::DenseTensor &self, const std::string &layout) { 
self.set_layout(StringToDataLayout(layout)); }) .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + [](phi::DenseTensor &self, paddle::platform::CustomPlace &place) { self.mutable_data(place); }) .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { + [](phi::DenseTensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::XPUPlace &place) { + [](phi::DenseTensor &self, paddle::platform::XPUPlace &place) { self.mutable_data(place); }) .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + [](phi::DenseTensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::NPUPlace &place) { + [](phi::DenseTensor &self, paddle::platform::NPUPlace &place) { self.mutable_data(place); }) .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::MLUPlace &place) { + [](phi::DenseTensor &self, paddle::platform::MLUPlace &place) { self.mutable_data(place); }) .def("_alloc_double", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + [](phi::DenseTensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + [](phi::DenseTensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + [](phi::DenseTensor &self, paddle::platform::CustomPlace &place) { self.mutable_data(place); }) .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::XPUPlace &place) { + [](phi::DenseTensor &self, paddle::platform::XPUPlace &place) { self.mutable_data(place); }) .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { + [](phi::DenseTensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::MLUPlace &place) { + [](phi::DenseTensor &self, paddle::platform::MLUPlace &place) { self.mutable_data(place); }) - .def("_alloc_int", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place) { - self.mutable_data(place); - }) + .def( + "_alloc_int", + [](phi::DenseTensor &self, paddle::platform::CUDAPinnedPlace &place) { + self.mutable_data(place); + }) + .def( + "_alloc_float", + [](phi::DenseTensor &self, paddle::platform::CUDAPinnedPlace &place) { + self.mutable_data(place); + }) .def("_mutable_data", - [](framework::Tensor &self, + [](phi::DenseTensor &self, paddle::platform::CPUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, + [](phi::DenseTensor &self, paddle::platform::CustomPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, + [](phi::DenseTensor &self, paddle::platform::XPUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) 
.def("_mutable_data", - [](framework::Tensor &self, + [](phi::DenseTensor &self, paddle::platform::CUDAPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, + [](phi::DenseTensor &self, paddle::platform::CUDAPinnedPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, + [](phi::DenseTensor &self, paddle::platform::MLUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) - .def("_clear", &framework::Tensor::clear) + .def("_clear", &phi::DenseTensor::clear) .def("_mutable_data", - [](framework::Tensor &self, + [](phi::DenseTensor &self, paddle::platform::NPUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( @@ -453,7 +453,7 @@ void BindTensor(pybind11::module &m) { // NOLINT .def( "shape", - [](framework::Tensor &self) { return vectorize(self.dims()); }, + [](phi::DenseTensor &self) { return vectorize(self.dims()); }, R"DOC( Return the shape of Tensor. @@ -472,7 +472,7 @@ void BindTensor(pybind11::module &m) { // NOLINT print(t.shape()) # [5, 30] )DOC") .def("_to_dlpack", - [](framework::Tensor &self) { + [](phi::DenseTensor &self) { DLPackTensor dlpack_tensor(self, 1); DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor(); auto capsule = py::capsule( @@ -496,25 +496,25 @@ void BindTensor(pybind11::module &m) { // NOLINT .def("_get_float_element", TensorGetElement) .def("_set_double_element", TensorSetElement) .def("_get_double_element", TensorGetElement) - .def("_place", [](framework::Tensor &self) { return self.place(); }) + .def("_place", [](phi::DenseTensor &self) { return self.place(); }) .def("_dtype", - [](framework::Tensor &self) { + [](phi::DenseTensor &self) { return framework::TransToProtoVarType(self.type()); }) .def("_layout", - [](framework::Tensor &self) { + [](phi::DenseTensor &self) { return DataLayoutToString(self.layout()); }) - .def("_share_data_with", &framework::Tensor::ShareDataWith) + .def("_share_data_with", &phi::DenseTensor::ShareDataWith) .def("__getitem__", PySliceTensor, py::return_value_policy::reference) .def("__str__", - [](const framework::Tensor &self) { + [](const phi::DenseTensor &self) { std::stringstream ostr; ostr << self; return ostr.str(); }) /* ------ End of original Tensor ------ */ .def("__init__", - [](framework::Tensor &instance, + [](phi::DenseTensor &instance, const std::vector> &recursive_sequence_lengths) { LoD new_lod; @@ -531,11 +531,11 @@ void BindTensor(pybind11::module &m) { // NOLINT "invalid, " "the LoD converted by recursive_sequence_lengths is %s", new_lod)); - new (&instance) framework::Tensor(new_offset_lod); + new (&instance) phi::DenseTensor(new_offset_lod); }) .def("__init__", - [](framework::Tensor &instance) { - new (&instance) framework::Tensor(); + [](phi::DenseTensor &instance) { + new (&instance) phi::DenseTensor(); }) // We implement offset based LOD in C++ while we use length based with // Python API. 
So we changed set_lod to set_recursive_sequence_lengths @@ -545,7 +545,7 @@ void BindTensor(pybind11::module &m) { // NOLINT // https://github.com/PaddlePaddle/Paddle/issues/10855 .def( "set_lod", - [](framework::Tensor &self, + [](phi::DenseTensor &self, const std::vector> &lod) { // the input lod is offset-based level-of-detail info LoD new_lod; @@ -581,7 +581,7 @@ void BindTensor(pybind11::module &m) { // NOLINT )DOC") .def( "set_recursive_sequence_lengths", - [](framework::Tensor &self, + [](phi::DenseTensor &self, const std::vector> &recursive_sequence_lengths) { // the input recursive_sequence_lengths is length-based @@ -631,7 +631,7 @@ void BindTensor(pybind11::module &m) { // NOLINT )DOC") .def( "lod", - [](framework::Tensor &self) -> std::vector> { + [](phi::DenseTensor &self) -> std::vector> { // output the offset-based lod info LoD lod = self.lod(); std::vector> new_lod; @@ -659,7 +659,7 @@ void BindTensor(pybind11::module &m) { // NOLINT // Set above comments of set_lod. .def( "recursive_sequence_lengths", - [](framework::Tensor &self) -> std::vector> { + [](phi::DenseTensor &self) -> std::vector> { // output the length-based lod info LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); std::vector> new_lod; @@ -687,7 +687,7 @@ void BindTensor(pybind11::module &m) { // NOLINT )DOC") .def( "has_valid_recursive_sequence_lengths", - [](framework::Tensor &self) -> bool { + [](phi::DenseTensor &self) -> bool { // Check that the lod info is valid and match the outermost // dimension of the Tensor data return CheckLoD(self.lod(), vectorize(self.dims()).front()); @@ -710,18 +710,18 @@ void BindTensor(pybind11::module &m) { // NOLINT print(t.has_valid_recursive_sequence_lengths()) # True )DOC") .def("_as_type", - [](const framework::Tensor &self, + [](const phi::DenseTensor &self, paddle::framework::proto::VarType::Type type) { - framework::Tensor dst; + phi::DenseTensor dst; if (self.IsInitialized() && self.numel() > 0) { TransDataType(self, type, &dst); } return dst; }) .def("_copy", - [](const framework::Tensor &self, const platform::Place &place) { + [](const phi::DenseTensor &self, const platform::Place &place) { // follow fetch_op's inplementation - framework::Tensor dst; + phi::DenseTensor dst; if (self.IsInitialized() && self.numel() > 0) { TensorCopySync(self, place, &dst); } else { @@ -737,7 +737,7 @@ void BindTensor(pybind11::module &m) { // NOLINT }) #ifdef PADDLE_WITH_CUDA .def("_share_buffer_with", - [](framework::Tensor &self, const framework::Tensor src, + [](phi::DenseTensor &self, const phi::DenseTensor src, py::tuple t) { auto *cuda_ipc_allocation = dynamic_cast( @@ -779,7 +779,7 @@ void BindTensor(pybind11::module &m) { // NOLINT )DOC") .def("_share_cuda", - [](framework::Tensor self) { + [](phi::DenseTensor self) { if (!self.IsInitialized() || self.numel() == 0) throw std::runtime_error( "Tensor not initialized or numel is 0. could not pass " @@ -841,7 +841,7 @@ void BindTensor(pybind11::module &m) { // NOLINT "Invalid Tensor meta info for shared cuda tensor!"); // 1. Create a new C++ instance - framework::Tensor tensor; + phi::DenseTensor tensor; // 2. Rebuild Allocation from handle const std::string &handle = t[0].cast(); @@ -883,7 +883,7 @@ void BindTensor(pybind11::module &m) { // NOLINT )DOC") #endif .def("_share_filename", - [](framework::Tensor &self) { + [](phi::DenseTensor &self) { if (!self.IsInitialized() || self.numel() == 0) throw std::runtime_error( "Tensor not initialized or numel is 0. 
could not pass to " @@ -955,7 +955,7 @@ void BindTensor(pybind11::module &m) { // NOLINT if (t.size() != 5) throw std::runtime_error("Invalid Tensor meta info state!"); - framework::Tensor tensor; + phi::DenseTensor tensor; // 2. Rebuild Allocation const std::string &ipc_name = t[0].cast(); @@ -993,7 +993,7 @@ void BindTensor(pybind11::module &m) { // NOLINT )DOC") .def("_shared_incref", - [](framework::Tensor &self) { + [](phi::DenseTensor &self) { auto *mmap_allocation = dynamic_cast< memory::allocation::RefcountedMemoryMapAllocation *>( self.Holder().get()); @@ -1005,7 +1005,7 @@ void BindTensor(pybind11::module &m) { // NOLINT Increase reference count of share_filename tensor. )DOC") .def("_shared_decref", - [](framework::Tensor &self) { + [](phi::DenseTensor &self) { auto *mmap_allocation = dynamic_cast< memory::allocation::RefcountedMemoryMapAllocation *>( self.Holder().get()); @@ -1017,7 +1017,7 @@ void BindTensor(pybind11::module &m) { // NOLINT Decrease reference count of share_filename tensor. )DOC") .def(py::pickle( - [](const framework::Tensor &t) { // __getstate__ + [](const phi::DenseTensor &t) { // __getstate__ auto holder = t.Holder(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true, platform::errors::PreconditionNotMet( @@ -1042,7 +1042,7 @@ void BindTensor(pybind11::module &m) { // NOLINT throw std::runtime_error("Invalid Tensor state!"); // 1. Create a new C++ instance - framework::Tensor tensor; + phi::DenseTensor tensor; // 2. Rebuild Allocation const std::string &ipc_name = t[0].cast(); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 4b01f2b568b0f..0003111f0cad3 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -264,7 +264,7 @@ inline std::string TensorDTypeToPyDTypeStr( } // namespace details template -T TensorGetElement(const framework::Tensor &self, size_t offset) { +T TensorGetElement(const phi::DenseTensor &self, size_t offset) { PADDLE_ENFORCE_LT(offset, self.numel(), platform::errors::InvalidArgument( @@ -314,7 +314,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { } template -void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { +void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { PADDLE_ENFORCE_LT(offset, self->numel(), platform::errors::InvalidArgument( @@ -362,7 +362,7 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { template void SetTensorFromPyArrayT( - framework::Tensor *self, + phi::DenseTensor *self, const py::array_t &array, const P &place, bool zero_copy) { @@ -502,7 +502,7 @@ void SetTensorFromPyArrayT( } template -void SetTensorFromPyArray(framework::Tensor *self, +void SetTensorFromPyArray(phi::DenseTensor *self, const py::object &obj, const P &place, bool zero_copy) { @@ -679,8 +679,8 @@ void SetUVATensorFromPyArray( } template -void _sliceCompute(const framework::Tensor *in, - framework::Tensor *out, +void _sliceCompute(const phi::DenseTensor *in, + phi::DenseTensor *out, const phi::CPUContext &ctx, const std::vector &axes, const std::vector &starts) { @@ -714,8 +714,8 @@ void _sliceCompute(const framework::Tensor *in, } template -void _concatCompute(const std::vector &ins, - paddle::framework::Tensor *out, +void _concatCompute(const std::vector &ins, + phi::DenseTensor *out, const phi::CPUContext &ctx, int64_t axis) { if (axis == 0 && ins.size() < 10) { @@ -739,7 +739,7 @@ void _concatCompute(const std::vector &ins, } } -inline void _getSliceinfo(const framework::Tensor 
&self, +inline void _getSliceinfo(const phi::DenseTensor &self, py::object obj, const int64_t dim, int64_t *pstart, @@ -791,9 +791,9 @@ inline void _getSliceinfo(const framework::Tensor &self, } } -inline framework::Tensor *_getTensor(const framework::Tensor &self, - const framework::DDim &ddim) { - framework::Tensor *output = new framework::Tensor(); +inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self, + const framework::DDim &ddim) { + phi::DenseTensor *output = new phi::DenseTensor(); output->Resize(ddim); auto place = self.place(); if (platform::is_cpu_place(place)) { @@ -819,8 +819,8 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, } template -void _sliceDapper(const framework::Tensor *in, - framework::Tensor *out, +void _sliceDapper(const phi::DenseTensor *in, + phi::DenseTensor *out, const phi::CPUContext &ctx, const std::vector &axes, const std::vector &starts, @@ -861,32 +861,32 @@ void _sliceDapper(const framework::Tensor *in, } template -inline framework::Tensor *_sliceWrapper(const framework::Tensor &self, - const phi::CPUContext &ctx, - py::object obj, - int dim, - int64_t start, - int64_t slicelength) { +inline phi::DenseTensor *_sliceWrapper(const phi::DenseTensor &self, + const phi::CPUContext &ctx, + py::object obj, + int dim, + int64_t start, + int64_t slicelength) { framework::DDim dstDDim = self.dims(); dstDDim[dim] = static_cast(slicelength); std::vector axes({dim}); std::vector starts({static_cast(start)}); - framework::Tensor *output = _getTensor(self, dstDDim); + phi::DenseTensor *output = _getTensor(self, dstDDim); _sliceDapper(&self, output, ctx, axes, starts, dstDDim.size()); return output; } template -inline framework::Tensor *_sliceAndConcat(const framework::Tensor &self, - py::object obj, - int dim) { +inline phi::DenseTensor *_sliceAndConcat(const phi::DenseTensor &self, + py::object obj, + int dim) { phi::CPUContext ctx; int64_t start, stop, step, slicelength; _getSliceinfo(self, obj, dim, &start, &stop, &step, &slicelength); if (step == 1 || slicelength == 1) { return _sliceWrapper(self, ctx, obj, dim, start, slicelength); } else { - std::vector ins; + std::vector ins; for (auto i = 0; i < slicelength; ++i, start += step) { ins.emplace_back(*_sliceWrapper(self, ctx, obj, dim, start, 1)); } @@ -894,15 +894,15 @@ inline framework::Tensor *_sliceAndConcat(const framework::Tensor &self, // do the concat operation framework::DDim dstDDim = self.dims(); dstDDim[dim] = static_cast(slicelength); - framework::Tensor *output1 = _getTensor(self, dstDDim); + phi::DenseTensor *output1 = _getTensor(self, dstDDim); _concatCompute(ins, output1, ctx, dim); return output1; } } -inline framework::Tensor *_sliceTensor(const framework::Tensor &self, - py::object obj, - int dim) { +inline phi::DenseTensor *_sliceTensor(const phi::DenseTensor &self, + py::object obj, + int dim) { auto src_type = framework::TransToProtoVarType(self.dtype()); switch (src_type) { case framework::proto::VarType::FP16: @@ -936,12 +936,12 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self, } } -inline framework::Tensor *_pySliceTensor(const framework::Tensor &self, - py::object obj) { +inline phi::DenseTensor *_pySliceTensor(const phi::DenseTensor &self, + py::object obj) { if (py::isinstance(obj)) { py::list l = static_cast(obj); - std::unique_ptr target; - framework::Tensor *src = const_cast(&self); + std::unique_ptr target; + phi::DenseTensor *src = const_cast(&self); for (auto i = 0; i < static_cast(l.size()); ++i) { src = _sliceTensor(*src, 
l[i], i); if (i + 1 == static_cast(l.size())) { @@ -956,15 +956,15 @@ inline framework::Tensor *_pySliceTensor(const framework::Tensor &self, } } -inline framework::Tensor *PySliceTensor(const framework::Tensor &self, - py::object obj) { +inline phi::DenseTensor *PySliceTensor(const phi::DenseTensor &self, + py::object obj) { if (platform::is_gpu_place(self.place())) { - std::unique_ptr holder; - framework::Tensor src; + std::unique_ptr holder; + phi::DenseTensor src; framework::TensorCopySync(self, platform::CPUPlace(), &src); - framework::Tensor *output = _pySliceTensor(src, obj); + phi::DenseTensor *output = _pySliceTensor(src, obj); holder.reset(output); - framework::Tensor *dst = _getTensor(*output, output->dims()); + phi::DenseTensor *dst = _getTensor(*output, output->dims()); framework::TensorCopySync(*output, self.place(), dst); return dst; } else { @@ -972,7 +972,7 @@ inline framework::Tensor *PySliceTensor(const framework::Tensor &self, } } -inline py::array TensorToPyArray(const framework::Tensor &tensor, +inline py::array TensorToPyArray(const phi::DenseTensor &tensor, bool need_deep_copy = false) { if (!tensor.IsInitialized()) { return py::array(); diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index a58c6cc5b86ef..e2fdd9a487121 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -121,11 +121,11 @@ class PredictExecutor : public MlirToRuntimeTranslator { int GetInputNum() { return inputs_.size(); } - ::phi::DenseTensor* GetInput(int i) { return inputs_[i]; } + ::Tensor* GetInput(int i) { return inputs_[i]; } int GetOutputNum() { return outputs_.size(); } - ::phi::DenseTensor* GetOutput(int i) { return outputs_[i]; } + ::Tensor* GetOutput(int i) { return outputs_[i]; } private: void Init(::infrt::phi::DenseTensorMap&& map) { @@ -158,10 +158,10 @@ class PredictExecutor : public MlirToRuntimeTranslator { AddValue(predict_func.getArgument(i), value); } else if (type.isa<::infrt::DenseTensorType>()) { // this param is an input Tensor - auto dht = ::phi::DenseTensor(); + auto dht = ::Tensor(); auto* value = new host_context::Value(std::move(dht)); arguments_.push_back(value); - inputs_.push_back(&(value->get<::phi::DenseTensor>())); + inputs_.push_back(&(value->get<::Tensor>())); } else { llvm_unreachable("The input type has not been supported by predictor."); } @@ -174,12 +174,12 @@ class PredictExecutor : public MlirToRuntimeTranslator { auto operand = last_op.getOperand(i); if (operand.getType().isa<::infrt::DenseTensorType>()) { auto r = impl_->value_map.try_emplace( - operand, ValueRef(new host_context::Value(::phi::DenseTensor()))); + operand, ValueRef(new host_context::Value(::Tensor()))); CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(operand) << "]"; auto* value = r.first->second.get(); results_.push_back(ValueRef(value)); - outputs_.push_back(&(value->get<::phi::DenseTensor>())); + outputs_.push_back(&(value->get<::Tensor>())); } else { llvm_unreachable("infrt.return only supports DenseTensor now."); } @@ -200,9 +200,9 @@ class PredictExecutor : public MlirToRuntimeTranslator { private: KernelRegistry* registry_{}; MlirFunctionExecutable* function_executable_; - llvm::SmallVector<::phi::DenseTensor*, 1> inputs_; + llvm::SmallVector<::Tensor*, 1> inputs_; llvm::SmallVector arguments_; - llvm::SmallVector<::phi::DenseTensor*, 1> outputs_; + llvm::SmallVector<::Tensor*, 1> outputs_; llvm::SmallVector results_; }; @@ -322,13 +322,13 @@ int InfRtPredictor::Init(const InfRtConfig& config) { int 
InfRtPredictor::GetInputNum() { return impl_->executor->GetInputNum(); } -::phi::DenseTensor* InfRtPredictor::GetInput(int i) { +::Tensor* InfRtPredictor::GetInput(int i) { return impl_->executor->GetInput(i); } int InfRtPredictor::GetOutputNum() { return impl_->executor->GetOutputNum(); } -::phi::DenseTensor* InfRtPredictor::GetOutput(int i) { +::Tensor* InfRtPredictor::GetOutput(int i) { return impl_->executor->GetOutput(i); } diff --git a/paddle/infrt/api/infrt_api.h b/paddle/infrt/api/infrt_api.h index fcaed78bdd9ae..511d51648d7f3 100644 --- a/paddle/infrt/api/infrt_api.h +++ b/paddle/infrt/api/infrt_api.h @@ -61,9 +61,9 @@ class InfRtPredictor { void Run(); int Init(const InfRtConfig& config); int GetInputNum(); - ::phi::DenseTensor* GetInput(int i); + ::Tensor* GetInput(int i); int GetOutputNum(); - ::phi::DenseTensor* GetOutput(int i); + ::Tensor* GetOutput(int i); protected: struct Impl; diff --git a/paddle/infrt/api/infrt_api_test.cc.in b/paddle/infrt/api/infrt_api_test.cc.in index f7d1c97603c63..32c8c25cd29c3 100644 --- a/paddle/infrt/api/infrt_api_test.cc.in +++ b/paddle/infrt/api/infrt_api_test.cc.in @@ -40,7 +40,7 @@ TEST(InfRtPredictor, predictor) { std::unique_ptr predictor = CreateInfRtPredictor(config); ::infrt::backends::CpuPhiAllocator cpu_allocator; - ::phi::DenseTensor* input = predictor->GetInput(0); + ::Tensor* input = predictor->GetInput(0); input->Resize({16, 784}); input->AllocateFrom(&cpu_allocator, ::phi::DataType::FLOAT32); auto* input_data = reinterpret_cast(input->data()); @@ -68,7 +68,7 @@ TEST(InfRtPredictor, cpu_predictor) { std::unique_ptr predictor = CreateInfRtPredictor(config); ::infrt::backends::CpuPhiAllocator cpu_allocator; - ::phi::DenseTensor* input = predictor->GetInput(0); + ::Tensor* input = predictor->GetInput(0); input->Resize({2, 3, 256, 256}); input->AllocateFrom(&cpu_allocator, ::phi::DataType::FLOAT32); auto* input_data = reinterpret_cast(input->data()); @@ -121,7 +121,7 @@ TEST(InfRtPredictor, trt_predictor) { std::unique_ptr predictor = CreateInfRtPredictor(config); ::infrt::backends::CpuPhiAllocator cpu_allocator; - ::phi::DenseTensor* input = predictor->GetInput(0); + ::Tensor* input = predictor->GetInput(0); input->Resize({2, 3, 256, 256}); input->AllocateFrom(&cpu_allocator, ::phi::DataType::FLOAT32); auto* input_data = reinterpret_cast(input->data()); @@ -166,7 +166,7 @@ TEST(InfRtPredictor, gpu_predictor) { ::infrt::backends::GpuPhiAllocator gpu_allocator; - ::phi::DenseTensor* input = predictor->GetInput(0); + ::Tensor* input = predictor->GetInput(0); input->Resize({2, 3, 256, 256}); input->AllocateFrom(&gpu_allocator, ::phi::DataType::FLOAT32); auto* data = reinterpret_cast(input->data()); diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index a539078e4af4d..97f36829ddaee 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -244,11 +244,11 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, } void TrtEngine::PrepareOutputHandle(const std::string& out_name) { - ::phi::DenseTensor t; + ::Tensor t; outputs_.emplace(out_name, t); } -::phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) { +::Tensor* TrtEngine::GetOutput(const std::string& name) { return &outputs_[name]; } @@ -256,7 +256,7 @@ size_t TrtEngine::GetOutputNum() const { return outputs_.size(); } bool TrtEngine::SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs) { + const std::unordered_map& 
inputs) { // TODO(wilber): now only create one exec_context FreshDeviceId(); CHECK(engine_ != nullptr); diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index 44f36a84cb5dc..5d7787f68a0fd 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -80,16 +80,15 @@ class TrtEngine { void Run(const ::phi::GPUContext& ctx); // TODO(wilber): How to support multiple execution contexts? - bool SetUpInference( - const InferenceOptions& inference, - const std::unordered_map& inputs); + bool SetUpInference(const InferenceOptions& inference, + const std::unordered_map& inputs); void GetEngineInfo(); void PrepareOutputHandle(const std::string& out_name); // TODO(wilber): The output tensor names are: output_0, output_1, ... - ::phi::DenseTensor* GetOutput(const std::string&); + ::Tensor* GetOutput(const std::string&); size_t GetOutputNum() const; @@ -119,7 +118,7 @@ class TrtEngine { std::vector> bindings_; int device_id_{0}; bool is_dynamic_shape_{false}; - std::unordered_map outputs_; + std::unordered_map outputs_; }; } // namespace tensorrt diff --git a/paddle/infrt/backends/tensorrt/trt_utils.h b/paddle/infrt/backends/tensorrt/trt_utils.h index b2d5659fd2520..e61b76e542e12 100644 --- a/paddle/infrt/backends/tensorrt/trt_utils.h +++ b/paddle/infrt/backends/tensorrt/trt_utils.h @@ -93,7 +93,7 @@ class TrtLogger : public nvinfer1::ILogger { struct Binding { bool is_input{false}; nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT}; - ::phi::DenseTensor* buffer{nullptr}; + ::Tensor* buffer{nullptr}; std::string name; }; @@ -104,7 +104,7 @@ class Bindings { void AddBinding(int32_t b, const std::string& name, bool is_input, - ::phi::DenseTensor* buffer, + ::Tensor* buffer, nvinfer1::DataType data_type) { while (bindings_.size() <= static_cast(b)) { bindings_.emplace_back(); diff --git a/paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.cc index 6a9f828dc9524..23d411021f969 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.cc @@ -80,7 +80,7 @@ void InfrtWeightsFoldPass::runOnFunction() { if (auto tensor_map_get_op = llvm::dyn_cast<::infrt::phi::TensorMapGetTensorOp>(user_op)) { ::llvm::StringRef arg_name = tensor_map_get_op.name(); - ::phi::DenseTensor* tensor = map.GetDenseTensor(arg_name.str()); + ::Tensor* tensor = map.GetDenseTensor(arg_name.str()); if (tensor->dtype() != ::phi::DataType::FLOAT32) { CHECK(false) << "the weight tensor type now only support float32."; diff --git a/paddle/infrt/host_context/kernel_frame.cc b/paddle/infrt/host_context/kernel_frame.cc index 266c145f47839..a03ed0d156eaf 100644 --- a/paddle/infrt/host_context/kernel_frame.cc +++ b/paddle/infrt/host_context/kernel_frame.cc @@ -38,7 +38,7 @@ std::string KernelFrame::DumpArgTypes() const { DUMP(tensor::DenseHostTensor); DUMP(float); DUMP(int); - DUMP(::phi::DenseTensor); + DUMP(::Tensor); DUMP(::phi::MetaTensor); DUMP(::phi::CPUContext); DUMP(host_context::None); diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 81b41d61ded3e..9b2190be23c1e 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -308,7 +308,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( arg_value = GetOpResult(upstream_op); } } - if 
(arg_value->is_type<::phi::DenseTensor>()) { + if (arg_value->is_type<::Tensor>()) { impl_->runtime->FeedInArgs( std::make_pair(std::to_string(i), ValueRef(arg_value))); } @@ -462,8 +462,8 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( for (int i = 0, e = op->getNumResults(); i < e; i++) { auto res = op->getResult(i); if (res.getType().isa<::infrt::DenseTensorType>()) { - auto r = impl_->value_map.try_emplace( - res, ValueRef(new Value{::phi::DenseTensor()})); + auto r = + impl_->value_map.try_emplace(res, ValueRef(new Value{::Tensor()})); CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) << "]"; res_values.push_back(r.first->second.get()); diff --git a/paddle/infrt/host_context/value.cc b/paddle/infrt/host_context/value.cc index 822ee108c897c..1bfdc59b96012 100644 --- a/paddle/infrt/host_context/value.cc +++ b/paddle/infrt/host_context/value.cc @@ -60,8 +60,8 @@ void CopyTo(const Value& from, Value* to) { else if (std::is_same::value) to->data = reinterpret_cast(arg); #ifdef INFRT_WITH_PHI - else if (std::is_same::value) - to->data = reinterpret_cast<::phi::DenseTensor const&>(arg); + else if (std::is_same::value) + to->data = reinterpret_cast<::Tensor const&>(arg); #endif else LOG(FATAL) << "Not supported Value copy: " << typeid(T).name(); diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index af785c13349fd..b5e47196d57bd 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -80,17 +80,17 @@ using ValueVariantType = ::infrt::TargetType, #ifdef INFRT_WITH_PHI ::phi::MetaTensor, - ::phi::DenseTensor, + ::Tensor, backends::CpuPhiContext, #ifdef INFRT_WITH_GPU backends::GpuPhiContext, ::phi::GPUContext, #endif // INFRT_WITH_GPU ::phi::CPUContext, - std::vector, - std::vector<::phi::DenseTensor*>, - paddle::experimental::ScalarBase<::phi::DenseTensor>, - paddle::experimental::IntArrayBase<::phi::DenseTensor>, + std::vector, + std::vector<::Tensor*>, + paddle::experimental::ScalarBase<::Tensor>, + paddle::experimental::IntArrayBase<::Tensor>, std::vector, std::vector<::phi::MetaTensor*>, ::phi::MetaConfig, @@ -146,7 +146,7 @@ class Value : public common::Object { explicit Value(::phi::GPUContext&& x) : data(std::move(x)) {} explicit Value(backends::GpuPhiContext&& x) : data(std::move(x)) {} #endif - explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} + explicit Value(::Tensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaConfig&& x) : data(std::move(x)) {} #ifdef INFRT_WITH_TRT diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index 8c49f47e7d873..645df69171048 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -37,27 +37,26 @@ namespace infrt { namespace kernel { namespace phi { -::phi::DenseTensor CreateDenseTensor( +::Tensor CreateDenseTensor( const ::phi::CPUContext& context, host_context::Attribute> dims, host_context::Attribute> lod, host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision) { - return ::phi::DenseTensor( - const_cast<::phi::Allocator*>(&context.GetAllocator()), - ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), - ::phi::make_ddim(dims.get()), - ConvertLayoutToPhi(layout.get()), - {})); + return ::Tensor(const_cast<::phi::Allocator*>(&context.GetAllocator()), + 
::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), + ::phi::make_ddim(dims.get()), + ConvertLayoutToPhi(layout.get()), + {})); } -::phi::DenseTensor CreateInitedDenseTensorF32( +::Tensor CreateInitedDenseTensorF32( const ::phi::CPUContext& context, host_context::Attribute> dims, host_context::Attribute> lod, host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute value) { - ::phi::DenseTensor dense_tensor( + ::Tensor dense_tensor( const_cast<::phi::Allocator*>(&context.GetAllocator()), ::phi::DenseTensorMeta( ConvertPrecisionToPhi(::infrt::PrecisionType::FLOAT32), @@ -71,13 +70,13 @@ ::phi::DenseTensor CreateInitedDenseTensorF32( return dense_tensor; } -::phi::DenseTensor CreateHostInitedDenseTensorF32( +::Tensor CreateHostInitedDenseTensorF32( const ::phi::CPUContext& context, host_context::Attribute> dims, host_context::Attribute> lod, host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute> values) { - ::phi::DenseTensor dense_tensor( + ::Tensor dense_tensor( const_cast<::phi::Allocator*>(&context.GetAllocator()), ::phi::DenseTensorMeta( ConvertPrecisionToPhi(::infrt::PrecisionType::FLOAT32), @@ -92,21 +91,20 @@ ::phi::DenseTensor CreateHostInitedDenseTensorF32( return dense_tensor; } -::phi::DenseTensor CreateGPUDenseTensor( +::Tensor CreateGPUDenseTensor( const ::phi::GPUContext& context, host_context::Attribute> dims, host_context::Attribute> lod, host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision) { - return ::phi::DenseTensor( - const_cast<::phi::Allocator*>(&context.GetAllocator()), - ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), - ::phi::make_ddim(dims.get()), - ConvertLayoutToPhi(layout.get()), - {})); + return ::Tensor(const_cast<::phi::Allocator*>(&context.GetAllocator()), + ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), + ::phi::make_ddim(dims.get()), + ConvertLayoutToPhi(layout.get()), + {})); } -void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, +void FillDenseTensorF32(::Tensor* dense_tensor, host_context::Attribute> value) { auto place = dense_tensor->place(); float* a_data = dense_tensor->mutable_data(place); @@ -127,7 +125,7 @@ void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, } } -void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { +void PrintDenseTensor(::Tensor* dense_tensor) { #ifndef INFRT_WITH_GPU #define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ case ::phi::DataType::PHI_DATATYPE: { \ @@ -204,8 +202,7 @@ ::infrt::phi::DenseTensorMap LoadParameters(const std::string& file_path) { std::ifstream param_file(param_path, std::ios::binary); switch (var.type().type()) { case ::paddle::framework::proto::VarType_Type_LOD_TENSOR: { - std::unique_ptr<::phi::DenseTensor> tensor{ - std::make_unique<::phi::DenseTensor>()}; + std::unique_ptr<::Tensor> tensor{std::make_unique<::Tensor>()}; ::infrt::paddle::DeserializeFromStream(param_file, tensor.get(), ctx); map.SetDenseTensor(var.name(), std::move(tensor)); } break; @@ -253,8 +250,7 @@ ::infrt::phi::DenseTensorMap LoadCombinedParameters( ctx.SetHostAllocator(allocator_ptr); ctx.SetZeroAllocator(allocator_ptr); for (auto& var : tmp) { - std::unique_ptr<::phi::DenseTensor> tensor{ - std::make_unique<::phi::DenseTensor>()}; + std::unique_ptr<::Tensor> tensor{std::make_unique<::Tensor>()}; ::infrt::paddle::DeserializeFromStream(param_file, tensor.get(), ctx); map.SetDenseTensor(var, std::move(tensor)); } @@ -289,8 +285,7 @@ ::infrt::phi::DenseTensorMap 
LoadCombinedParamsToGpu( ctx.PartialInitWithoutAllocator(); for (auto& var : tmp) { - std::unique_ptr<::phi::DenseTensor> tensor{ - std::make_unique<::phi::DenseTensor>()}; + std::unique_ptr<::Tensor> tensor{std::make_unique<::Tensor>()}; ::paddle::framework::DeserializeFromStream(param_file, tensor.get(), ctx); map.SetDenseTensor(var, std::move(tensor)); } @@ -305,9 +300,8 @@ ::infrt::phi::DenseTensorMap LoadCombinedParams( return LoadCombinedParameters(model_path.get(), params_path.get()); } -::phi::DenseTensor TensorMapGetTensor( - const ::infrt::phi::DenseTensorMap& map, - host_context::Attribute name) { +::Tensor TensorMapGetTensor(const ::infrt::phi::DenseTensorMap& map, + host_context::Attribute name) { auto* tensor = map.GetDenseTensor(name.get()); CHECK(tensor); return *tensor; @@ -348,10 +342,10 @@ inline size_t SizeOfDataType(::phi::DataType data_type) { } return 0; } -void GpuMemCpy(const ::phi::DenseTensor& input, +void GpuMemCpy(const ::Tensor& input, const ::phi::GPUContext& context, bool d2h, - ::phi::DenseTensor* output) { + ::Tensor* output) { if (d2h) { CHECK(input.place().GetType() == ::phi::AllocationType::GPU); diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 573b8f102ec7c..4a41ccdcfa29d 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -25,46 +25,45 @@ namespace infrt { namespace kernel { namespace phi { -::phi::DenseTensor CreateDenseTensor( +::Tensor CreateDenseTensor( const ::phi::CPUContext& context, host_context::Attribute> dims, host_context::Attribute> lod, host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision); -::phi::DenseTensor CreateInitedDenseTensorF32( +::Tensor CreateInitedDenseTensorF32( const ::phi::CPUContext& context, host_context::Attribute> dims, host_context::Attribute> lod, host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute value); -::phi::DenseTensor CreateHostInitedDenseTensorF32( +::Tensor CreateHostInitedDenseTensorF32( const ::phi::CPUContext& context, host_context::Attribute> dims, host_context::Attribute> lod, host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute> values); -::phi::DenseTensor CreateGPUDenseTensor( +::Tensor CreateGPUDenseTensor( const ::phi::GPUContext& context, host_context::Attribute> dims, host_context::Attribute> lod, host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision); -void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, +void FillDenseTensorF32(::Tensor* dense_tensor, host_context::Attribute> values); -void PrintDenseTensor(::phi::DenseTensor* dense_tensor); +void PrintDenseTensor(::Tensor* dense_tensor); ::infrt::phi::DenseTensorMap LoadParameters(const std::string& path); ::infrt::phi::DenseTensorMap LoadParams( host_context::Attribute path); -::phi::DenseTensor TensorMapGetTensor( - const ::infrt::phi::DenseTensorMap& map, - host_context::Attribute name); +::Tensor TensorMapGetTensor(const ::infrt::phi::DenseTensorMap& map, + host_context::Attribute name); ::infrt::phi::DenseTensorMap LoadCombinedParams( host_context::Attribute model_path, @@ -79,10 +78,10 @@ ::infrt::phi::DenseTensorMap LoadCombinedParamsToGpu( int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map); #ifdef INFRT_WITH_GPU -void GpuMemCpy(const ::phi::DenseTensor& input, +void GpuMemCpy(const ::Tensor& input, const ::phi::GPUContext& 
context, bool d2h, - ::phi::DenseTensor* output); + ::Tensor* output); #endif } // namespace phi diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index aa577da60c3ae..c37569f8b4cb0 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -26,9 +26,9 @@ namespace infrt { namespace kernel { namespace { -static void ElementwiseAddTest(const ::phi::DenseTensor& a, - const ::phi::DenseTensor& b, - ::phi::DenseTensor* c); +static void ElementwiseAddTest(const ::Tensor& a, + const ::Tensor& b, + ::Tensor* c); } TEST(utils, registry) { @@ -66,9 +66,9 @@ TEST(ElementwiseAdd, launcher_registry) { auto fancy_allocator = std::unique_ptr<::phi::Allocator>(new FancyAllocator); auto* alloc = fancy_allocator.get(); - ::phi::DenseTensor a(alloc, meta); - ::phi::DenseTensor b(alloc, meta); - ::phi::DenseTensor c(alloc, meta); + ::Tensor a(alloc, meta); + ::Tensor b(alloc, meta); + ::Tensor c(alloc, meta); auto place = ::phi::CPUPlace(); float* a_data = a.mutable_data(place); diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc index cb9640451f9b2..6ee0bc20f9939 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc @@ -25,9 +25,9 @@ void InferShapedKernelLauncher::CreateKernelFrameForInferShape( for (host_context::Value* value : frame->GetValues(1, frame->GetNumElements() - 1)) { // TODO(Superjomn) To extend this. - if (value->is_type<::phi::DenseTensor>()) { - values.emplace_back(new host_context::Value{ - ::phi::MetaTensor{&value->get<::phi::DenseTensor>()}}); + if (value->is_type<::Tensor>()) { + values.emplace_back( + new host_context::Value{::phi::MetaTensor{&value->get<::Tensor>()}}); infershape_kernel_frame_builder.AddArgument(values.back().get()); } else { infershape_kernel_frame_builder.AddArgument(value); diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h index 531d77ba952aa..999369c582654 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h @@ -23,7 +23,7 @@ namespace infrt { namespace kernel { namespace infershaped { -using KeyType = const ::phi::DenseTensor&; +using KeyType = const ::Tensor&; using CountType = uint8_t; constexpr CountType value(std::true_type) { return 1; } diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 2e952e77d1f0a..77c2f90b26b8a 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -68,14 +68,14 @@ int32_t TensorMapGetSize(TensorMap map) { return map.size(); } // TODO(wilber): Maybe we should place TensorList type in dt dialect. 
#ifdef INFRT_WITH_PHI -::phi::DenseTensor TensorListGetTensor(std::vector<::phi::DenseTensor *> list, - Attribute idx) { +::Tensor TensorListGetTensor(std::vector<::Tensor *> list, + Attribute idx) { CHECK_LT(idx.get(), static_cast(list.size())) << "idx should less than list size"; return *list[idx.get()]; } -int32_t TensorListGetSize(const std::vector<::phi::DenseTensor *> &list) { +int32_t TensorListGetSize(const std::vector<::Tensor *> &list) { return list.size(); } #endif diff --git a/paddle/infrt/kernel/tensorrt/trt_helper.h b/paddle/infrt/kernel/tensorrt/trt_helper.h index 4f1f1dde38cbe..6f7455b848d58 100644 --- a/paddle/infrt/kernel/tensorrt/trt_helper.h +++ b/paddle/infrt/kernel/tensorrt/trt_helper.h @@ -64,7 +64,7 @@ static std::vector ArrayAttrToVec(const mlir::ArrayAttr& int_array_attr) { return ret; } -static nvinfer1::Weights TensorToWeights(::phi::DenseTensor* tensor) { +static nvinfer1::Weights TensorToWeights(::Tensor* tensor) { CHECK_NOTNULL(tensor); nvinfer1::Weights ret; ret.type = TensorTypeToWeightType(tensor->dtype()); diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc index 931fe21b2c710..3d30b0264c2d4 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -69,7 +69,7 @@ ::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( auto& region = operation.getRegion(0); auto& block = region.getBlocks().front(); - std::unordered_map trt_bind_inputs; + std::unordered_map trt_bind_inputs; ValueToITensorMap value_to_trt_tensor_map; ValueToTensorMap value_to_tensor_map; @@ -80,7 +80,7 @@ ::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( const std::string input_name = "input_" + std::to_string(idx); auto* v = symbol_table->GetValue(std::to_string(idx)); CHECK_NOTNULL(v); - auto* t = &v->get<::phi::DenseTensor>(); + auto* t = &v->get<::Tensor>(); value_to_tensor_map[operand] = t; // TODO(wilber): get input info from mlir. 
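// --- Illustrative sketch (not part of this patch) ---------------------------
// The TensorToWeights helper in trt_helper.h above wraps a ::Tensor
// (phi::DenseTensor) buffer as nvinfer1::Weights without copying. A minimal
// float32-only version could look roughly like the following; ToFloatWeights
// is a hypothetical name and the float32 assumption is ours -- the real helper
// dispatches on tensor->dtype() via TensorTypeToWeightType.
#include <NvInfer.h>                          // nvinfer1::Weights, DataType
#include "paddle/phi/core/dense_tensor.h"     // phi::DenseTensor

static nvinfer1::Weights ToFloatWeights(::phi::DenseTensor* tensor) {
  nvinfer1::Weights w;
  w.type = nvinfer1::DataType::kFLOAT;  // assumes the tensor holds float32
  w.values = tensor->data();            // borrow the existing buffer, no copy
  w.count = tensor->numel();            // element count, not byte count
  return w;
}
// -----------------------------------------------------------------------------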
@@ -186,10 +186,10 @@ void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) { engine->GetEngineInfo(); } -std::vector<::phi::DenseTensor*> TrtEngineCompute( - backends::tensorrt::TrtEngine* engine, const ::phi::GPUContext& context) { +std::vector<::Tensor*> TrtEngineCompute(backends::tensorrt::TrtEngine* engine, + const ::phi::GPUContext& context) { engine->Run(context); - std::vector<::phi::DenseTensor*> res; + std::vector<::Tensor*> res; for (size_t i = 0; i < engine->GetOutputNum(); ++i) { res.push_back(engine->GetOutput("output_" + std::to_string(i))); } diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h index bf41c124a299b..254b8ed14d7d9 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.h +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h @@ -40,8 +40,8 @@ ::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( void PrintTrtLayer(backends::tensorrt::TrtEngine* engine); -std::vector<::phi::DenseTensor*> TrtEngineCompute( - backends::tensorrt::TrtEngine* engine, const ::phi::GPUContext& context); +std::vector<::Tensor*> TrtEngineCompute(backends::tensorrt::TrtEngine* engine, + const ::phi::GPUContext& context); } // namespace tensorrt } // namespace kernel diff --git a/paddle/infrt/paddle/model_parser.cc b/paddle/infrt/paddle/model_parser.cc index da4f8b6420b22..6fc358a4c043d 100644 --- a/paddle/infrt/paddle/model_parser.cc +++ b/paddle/infrt/paddle/model_parser.cc @@ -207,7 +207,7 @@ inline ::phi::DataType PhiDataType(framework_proto::VarType::Type type) { } inline void TensorFromStream(std::istream &is, - ::phi::DenseTensor *tensor, + ::Tensor *tensor, const ::phi::CPUContext &ctx) { uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); @@ -237,7 +237,7 @@ inline void TensorFromStream(std::istream &is, } void DeserializeFromStream(std::istream &is, - ::phi::DenseTensor *tensor, + ::Tensor *tensor, const ::phi::CPUContext &dev_ctx) { { // the 1st field, unit32_t version for LoDTensor diff --git a/paddle/infrt/paddle/model_parser.h b/paddle/infrt/paddle/model_parser.h index 5f039ad5d3ad8..39af7a919318b 100644 --- a/paddle/infrt/paddle/model_parser.h +++ b/paddle/infrt/paddle/model_parser.h @@ -60,7 +60,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents); #ifdef INFRT_WITH_PHI void DeserializeFromStream(std::istream& is, - ::phi::DenseTensor* tensor, + ::Tensor* tensor, const ::phi::CPUContext& dev_ctx); #endif diff --git a/paddle/infrt/tensor/phi/tensor_map.cc b/paddle/infrt/tensor/phi/tensor_map.cc index afac7175caf4f..dd273a175d200 100644 --- a/paddle/infrt/tensor/phi/tensor_map.cc +++ b/paddle/infrt/tensor/phi/tensor_map.cc @@ -20,8 +20,8 @@ namespace infrt { namespace phi { -void DenseTensorMap::SetDenseTensor( - const std::string& name, std::unique_ptr<::phi::DenseTensor>&& tensor) { +void DenseTensorMap::SetDenseTensor(const std::string& name, + std::unique_ptr<::Tensor>&& tensor) { std::lock_guard lock(mu_); auto it = map_.emplace(std::make_pair(name, std::move(tensor))); if (!it.second) { @@ -29,8 +29,7 @@ void DenseTensorMap::SetDenseTensor( } } -::phi::DenseTensor* DenseTensorMap::GetDenseTensor( - const std::string& name) const { +::Tensor* DenseTensorMap::GetDenseTensor(const std::string& name) const { std::lock_guard lock(mu_); auto it = map_.find(name); if (it != map_.end()) { diff --git a/paddle/infrt/tensor/phi/tensor_map.h b/paddle/infrt/tensor/phi/tensor_map.h index 5a754f42fb63c..8b72cd924bf58 100644 --- a/paddle/infrt/tensor/phi/tensor_map.h +++ 
b/paddle/infrt/tensor/phi/tensor_map.h @@ -26,13 +26,13 @@ class DenseTensorMap { DenseTensorMap() = default; DenseTensorMap(DenseTensorMap&& other) : map_(std::move(other.map_)) {} void SetDenseTensor(const std::string& name, - std::unique_ptr<::phi::DenseTensor>&& tensor); - ::phi::DenseTensor* GetDenseTensor(const std::string& name) const; + std::unique_ptr<::Tensor>&& tensor); + ::Tensor* GetDenseTensor(const std::string& name) const; size_t size() const; private: mutable std::mutex mu_; - std::unordered_map> map_; + std::unordered_map> map_; }; } // namespace phi diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index 67cedaf6710ab..628ea8c10245a 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -63,7 +63,7 @@ class AbstractAutogradMeta { * computation. * * This is a new Tensor design, which is independent of the original - * framework::Tensor in fluid. The original Tensor will be gradually discarded + * phi::DenseTensor in fluid. The original Tensor will be gradually discarded * in the future. * * Note: Tensor can be NULL state, Tensor is meaningful only when the diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index c9fb2d3734edc..b597b5085479d 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -32,7 +32,7 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) { } std::unique_ptr MakePhiDenseTensor( - const paddle::framework::Tensor& src) { + const phi::DenseTensor& src) { return std::make_unique(src); } @@ -62,9 +62,7 @@ phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable) { } } -phi::IntArray MakePhiIntArray(const paddle::framework::Tensor& src) { - return {src}; -} +phi::IntArray MakePhiIntArray(const phi::DenseTensor& src) { return {src}; } phi::IntArray MakePhiIntArrayFromVar(const framework::Variable& variable) { if (variable.IsType()) { diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index f930f5b11f64f..5b237f433aa6f 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -29,9 +29,9 @@ namespace paddle { namespace experimental { std::unique_ptr MakePhiDenseTensor( - const paddle::framework::Tensor& src); + const phi::DenseTensor& src); -phi::IntArray MakePhiIntArray(const paddle::framework::Tensor& src); +phi::IntArray MakePhiIntArray(const phi::DenseTensor& src); phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable); diff --git a/paddle/phi/backends/custom/custom_device_test.cc b/paddle/phi/backends/custom/custom_device_test.cc index 2458241c3c85d..5b96a9979a596 100644 --- a/paddle/phi/backends/custom/custom_device_test.cc +++ b/paddle/phi/backends/custom/custom_device_test.cc @@ -76,7 +76,7 @@ void TestDeviceInterface(const paddle::platform::Place& place) { void TestTensorMutableData(const paddle::platform::Place& place) { std::cout << "TestTensorInitialization on " << place << std::endl; - paddle::framework::Tensor src_tensor; + phi::DenseTensor src_tensor; float* p1 = nullptr; float* p2 = nullptr; // initialization @@ -101,8 +101,8 @@ void TestTensorMutableData(const paddle::platform::Place& place) { void TestTensorShareDataWith(const paddle::platform::Place& place) { std::cout << "TestTensorShareDataWith on " << place << std::endl; - paddle::framework::Tensor src_tensor; - paddle::framework::Tensor dst_tensor; + phi::DenseTensor src_tensor; + phi::DenseTensor dst_tensor; 
src_tensor.mutable_data(phi::make_ddim({2, 3, 4}), place); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); @@ -113,9 +113,9 @@ void TestTensorUtils(const paddle::platform::Place& place) { if (paddle::platform::is_custom_place(place) == false) { return; } - paddle::framework::Tensor src_tensor; - paddle::framework::Tensor gpu_tensor; - paddle::framework::Tensor dst_tensor; + phi::DenseTensor src_tensor; + phi::DenseTensor gpu_tensor; + phi::DenseTensor dst_tensor; int* src_ptr = src_tensor.mutable_data(phi::make_ddim({3, 3}), paddle::platform::CPUPlace()); @@ -148,7 +148,7 @@ void TestTensorUtils(const paddle::platform::Place& place) { EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); } - paddle::framework::Tensor slice_tensor = src_tensor.Slice(1, 2); + phi::DenseTensor slice_tensor = src_tensor.Slice(1, 2); // CPU Slice Tensor to GPU Tensor paddle::framework::TensorCopy(slice_tensor, place, gpu_ctx, &gpu_tensor); diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index e9a6be66b98ca..abf242acdb22a 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -192,9 +192,9 @@ class DenseTensor : public TensorBase, - Question: In what scenarios will version counters NOT be shared? - Answer: Replacing a `Variable`'s data by calling `Tensor::ShareDataWith(...)` or `Tensor::ShareBufferWith(...)`. Because they - share the same Allocation but not framework::Tensor. + share the same Allocation but not phi::DenseTensor. - - Question: Why put the inplace_version_counter_ in framework::Tensor instead + - Question: Why put the inplace_version_counter_ in phi::DenseTensor instead of Allocation or Variable? - Answer: 1. Tensor can call ResetHolder() to reset the corresponding Allocation so diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index eead55d8a0067..1ed772fd67586 100644 --- a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ /* --------------------------- */ -/* From framework::Tensor */ +/* From phi::DenseTensor */ /* --------------------------- */ -/* The following members & interfaces were copied from framework::Tensor, +/* The following members & interfaces were copied from phi::DenseTensor, so as to facilitate the unification of different Tensors Will be adjusted/removed/moved in the near future @@ -134,7 +134,7 @@ inline void set_format(const dnnl::memory::format_tag format) { /* ------------------------------ */ /* From framework::LoDTensor */ /* ------------------------------ */ -/* The following members & interfaces were copied from framework::Tensor, +/* The following members & interfaces were copied from phi::DenseTensor, so as to facilitate the unification of different Tensors Will be adjusted/removed/moved in the near future diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index c4600328c4afa..4982f0db3e012 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -25,7 +25,7 @@ limitations under the License. 
*/ namespace phi { /* --------------------------- */ -/* From framework::Tensor */ +/* From phi::DenseTensor */ /* --------------------------- */ DenseTensor::DenseTensor() { meta_.dtype = paddle::experimental::DataType::FLOAT32; diff --git a/paddle/phi/kernels/funcs/fc_functor.cc b/paddle/phi/kernels/funcs/fc_functor.cc index 0434483be1326..f428746bc524d 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cc +++ b/paddle/phi/kernels/funcs/fc_functor.cc @@ -33,12 +33,12 @@ void FCFunctor::operator()(const DeviceContext& context, bool relu, bool padding_weights) { auto blas = GetBlas(context); - paddle::framework::Tensor Y1; + phi::DenseTensor Y1; T* Y1_data = nullptr; if (padding_weights) { const int NN = N + 4; const int KK = K + 4; - paddle::framework::Tensor X1; + phi::DenseTensor X1; T* X1_data = X1.mutable_data({M * KK}, paddle::platform::CPUPlace()); Y1_data = Y1.mutable_data({M * (N + 4)}, paddle::platform::CPUPlace()); #ifdef PADDLE_WITH_MKLML diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 19bbec124f2ca..7102b9cb11ad6 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -108,8 +108,8 @@ DEFINE_CPU_TRANS(6); template void TransposeNormal::operator()( const DeviceContext& context, - const paddle::framework::Tensor& in, - paddle::framework::Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, const std::vector& axis) { const int rank = axis.size(); auto in_stride = phi::stride(in.dims()); @@ -151,7 +151,7 @@ DEFINE_CPU_TRANS_NORMAL(phi::dtype::complex); DEFINE_CPU_TRANS_NORMAL(phi::dtype::complex); struct TensorSetConstantCPU { - TensorSetConstantCPU(paddle::framework::Tensor* tensor, float value) + TensorSetConstantCPU(phi::DenseTensor* tensor, float value) : tensor_(tensor), value_(value) {} template void apply() const { @@ -159,14 +159,14 @@ struct TensorSetConstantCPU { auto* begin = tensor_->mutable_data(cpu); std::fill(begin, begin + tensor_->numel(), static_cast(value_)); } - paddle::framework::Tensor* tensor_; + phi::DenseTensor* tensor_; float value_; }; template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) { PADDLE_THROW(phi::errors::Unimplemented("XPUPlace is not supported")); } @@ -174,7 +174,7 @@ void set_constant_with_place( template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) { PADDLE_THROW(phi::errors::Unimplemented("NPUPlace is not supported")); } @@ -182,7 +182,7 @@ void set_constant_with_place( template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) { PADDLE_THROW(phi::errors::Unimplemented("NPUPinnedPlace is not supported")); } @@ -190,7 +190,7 @@ void set_constant_with_place( template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) { PADDLE_THROW(phi::errors::Unimplemented("IPUPlace is not supported")); } @@ -198,7 +198,7 @@ void set_constant_with_place( template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) { PADDLE_THROW(phi::errors::Unimplemented("CustomPlace is not supported")); } @@ 
-206,7 +206,7 @@ void set_constant_with_place( template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) { phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); } @@ -214,7 +214,7 @@ void set_constant_with_place( template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) { PADDLE_THROW(phi::errors::Unimplemented("MLUPlace is not supported")); } @@ -222,7 +222,7 @@ void set_constant_with_place( template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) { phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); } @@ -230,7 +230,7 @@ void set_constant_with_place( struct TensorSetConstantWithPlace : public std::unary_function { TensorSetConstantWithPlace(const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) : context_(context), tensor_(tensor), value_(value) {} @@ -240,12 +240,12 @@ struct TensorSetConstantWithPlace } const paddle::platform::DeviceContext& context_; - paddle::framework::Tensor* tensor_; + phi::DenseTensor* tensor_; float value_; }; void set_constant(const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) { TensorSetConstantWithPlace func(context, tensor, value); #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -273,9 +273,9 @@ template struct RowwiseMean; template struct RowwiseAdd { void operator()(const phi::CPUContext& context, - const paddle::framework::Tensor& input, - const paddle::framework::Tensor& vector, - paddle::framework::Tensor* output) { + const phi::DenseTensor& input, + const phi::DenseTensor& vector, + phi::DenseTensor* output) { auto in_dims = input.dims(); auto out_dims = output->dims(); auto size = input.numel() / in_dims[0]; diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index c829adbc41373..06ea7f573a5a1 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -105,8 +105,8 @@ __global__ void TransposeNormalKernel(const T* in_ptr, template void TransposeNormal::operator()( const DeviceContext& context, - const paddle::framework::Tensor& in, - paddle::framework::Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, const std::vector& axis) { const int rank = axis.size(); auto in_stride = phi::stride(in.dims()); @@ -215,7 +215,7 @@ DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex); struct TensorSetConstantGPU { TensorSetConstantGPU(const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) : context_(context), tensor_(tensor), value_(value) {} @@ -228,14 +228,14 @@ struct TensorSetConstantGPU { } const paddle::platform::DeviceContext& context_; - paddle::framework::Tensor* tensor_; + phi::DenseTensor* tensor_; float value_; }; template <> void set_constant_with_place( const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value) { phi::VisitDataType(tensor->dtype(), TensorSetConstantGPU(context, tensor, value)); @@ -255,9 +255,9 @@ __global__ void RowwiseAddKernel( template struct RowwiseAdd { void operator()(const phi::GPUContext& 
context, - const paddle::framework::Tensor& input, - const paddle::framework::Tensor& vector, - paddle::framework::Tensor* output) { + const phi::DenseTensor& input, + const phi::DenseTensor& vector, + phi::DenseTensor* output) { auto in_dims = input.dims(); auto out_dims = output->dims(); auto size = input.numel() / in_dims[0]; @@ -304,8 +304,8 @@ template struct ColwiseSum; template <> void ColwiseSum::operator()( const phi::GPUContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* vector) { + const phi::DenseTensor& input, + phi::DenseTensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), @@ -316,7 +316,7 @@ void ColwiseSum::operator()( " dimension. Expected vector size=%d, but received %d", size, vector->numel())); - paddle::framework::Tensor one; + phi::DenseTensor one; one.mutable_data({in_dims[0]}, context.GetPlace()); SetConstant set; set(context, &one, static_cast(1.0)); @@ -340,8 +340,8 @@ template struct RowwiseSum; template <> void RowwiseSum::operator()( const phi::GPUContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* vector) { + const phi::DenseTensor& input, + phi::DenseTensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), @@ -352,7 +352,7 @@ void RowwiseSum::operator()( " dimension. Expected vector size=%d, but received %d", in_dims[0], vector->numel())); - paddle::framework::Tensor one; + phi::DenseTensor one; one.mutable_data({size}, context.GetPlace()); SetConstant set; set(context, &one, static_cast(1.0)); diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h index d894ef2b41d82..3a95c998b1f9a 100644 --- a/paddle/phi/kernels/funcs/math_function.h +++ b/paddle/phi/kernels/funcs/math_function.h @@ -34,84 +34,82 @@ template struct TransposeNormal { // for dims >= 7 situation void operator()(const DeviceContext& context, - const paddle::framework::Tensor& in, - paddle::framework::Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, const std::vector& axis); }; template struct Transpose { void operator()(const DeviceContext& context, - const paddle::framework::Tensor& in, - paddle::framework::Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, const std::vector& axis); }; template struct SetConstant { void operator()(const DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, T num); }; #ifdef PADDLE_WITH_XPU template struct SetConstant { - void operator()(const XPUContext& context, - paddle::framework::Tensor* tensor, - T num); + void operator()(const XPUContext& context, phi::DenseTensor* tensor, T num); }; template struct SetConstant { void operator()(const paddle::platform::XPUDeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, T num); }; #endif template void set_constant_with_place(const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value); void set_constant(const paddle::platform::DeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, float value); template struct RowwiseAdd { void operator()(const DeviceContext& context, - const paddle::framework::Tensor& input, - const paddle::framework::Tensor& vec, - paddle::framework::Tensor* output); + const phi::DenseTensor& input, + const phi::DenseTensor& vec, + phi::DenseTensor* 
output); }; template struct ColwiseSum { void operator()(const DeviceContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* vec); + const phi::DenseTensor& input, + phi::DenseTensor* vec); }; template struct RowwiseSum { void operator()(const DeviceContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* vec); + const phi::DenseTensor& input, + phi::DenseTensor* vec); }; template struct RowwiseMean { void operator()(const DeviceContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* vec); + const phi::DenseTensor& input, + phi::DenseTensor* vec); }; #ifdef PADDLE_WITH_XPU template struct TensorSetConstantXPU { - TensorSetConstantXPU(paddle::framework::Tensor* tensor, + TensorSetConstantXPU(phi::DenseTensor* tensor, U value, paddle::platform::Place place) : tensor_(tensor), value_(value), place_(place) {} @@ -127,7 +125,7 @@ struct TensorSetConstantXPU { static_cast(data_cpu.get()), numel * sizeof(T)); } - paddle::framework::Tensor* tensor_; + phi::DenseTensor* tensor_; U value_; paddle::platform::Place place_; }; diff --git a/paddle/phi/kernels/funcs/math_function_impl.h b/paddle/phi/kernels/funcs/math_function_impl.h index a6aeeb4f63c0d..512f21e82091c 100644 --- a/paddle/phi/kernels/funcs/math_function_impl.h +++ b/paddle/phi/kernels/funcs/math_function_impl.h @@ -25,8 +25,9 @@ namespace funcs { using paddle::framework::To32BitIndex; template -void SetConstant::operator()( - const DeviceContext& context, paddle::framework::Tensor* tensor, T num) { +void SetConstant::operator()(const DeviceContext& context, + phi::DenseTensor* tensor, + T num) { auto t = paddle::framework::EigenVector::Flatten(*tensor); t.device(*context.eigen_device()) = t.constant(static_cast(num)); } @@ -34,7 +35,7 @@ void SetConstant::operator()( #ifdef PADDLE_WITH_XPU template void SetConstant::operator()(const XPUContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, T num) { phi::VisitDataType(tensor->dtype(), TensorSetConstantXPU(tensor, num, context.GetPlace())); @@ -42,7 +43,7 @@ void SetConstant::operator()(const XPUContext& context, template void SetConstant::operator()( const paddle::platform::XPUDeviceContext& context, - paddle::framework::Tensor* tensor, + phi::DenseTensor* tensor, T num) { phi::VisitDataType(tensor->dtype(), TensorSetConstantXPU(tensor, num, context.GetPlace())); @@ -52,8 +53,8 @@ void SetConstant::operator()( template void Transpose::operator()( const DeviceContext& context, - const paddle::framework::Tensor& in, - paddle::framework::Tensor* out, + const phi::DenseTensor& in, + phi::DenseTensor* out, const std::vector& axis) { Eigen::array permute; for (int i = 0; i < Rank; i++) { @@ -74,10 +75,9 @@ void Transpose::operator()( } template -void ColwiseSum::operator()( - const DeviceContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* out) { +void ColwiseSum::operator()(const DeviceContext& context, + const phi::DenseTensor& input, + phi::DenseTensor* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(out->numel(), @@ -102,8 +102,8 @@ template class ColwiseSum { public: void operator()(const phi::CPUContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* out) { + const phi::DenseTensor& input, + phi::DenseTensor* out) { auto& in_dims = input.dims(); auto height = in_dims[0]; auto size = in_dims[1]; @@ -133,10 +133,9 @@ class ColwiseSum { }; template -void 
RowwiseMean::operator()( - const DeviceContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* out) { +void RowwiseMean::operator()(const DeviceContext& context, + const phi::DenseTensor& input, + phi::DenseTensor* out) { auto in_dims = input.dims(); PADDLE_ENFORCE_EQ(in_dims.size(), 2U, @@ -165,8 +164,8 @@ template class RowwiseMean { public: void operator()(const phi::CPUContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* out) { + const phi::DenseTensor& input, + phi::DenseTensor* out) { auto& in_dims = input.dims(); PADDLE_ENFORCE_EQ( in_dims.size(), @@ -200,10 +199,9 @@ class RowwiseMean { }; template -void RowwiseSum::operator()( - const DeviceContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* out) { +void RowwiseSum::operator()(const DeviceContext& context, + const phi::DenseTensor& input, + phi::DenseTensor* out) { auto in_dims = input.dims(); PADDLE_ENFORCE_EQ(in_dims.size(), 2U, @@ -232,8 +230,8 @@ template class RowwiseSum { public: void operator()(const phi::CPUContext& context, - const paddle::framework::Tensor& input, - paddle::framework::Tensor* out) { + const phi::DenseTensor& input, + phi::DenseTensor* out) { auto& in_dims = input.dims(); PADDLE_ENFORCE_EQ( in_dims.size(), diff --git a/paddle/phi/kernels/funcs/sequence2batch.cc b/paddle/phi/kernels/funcs/sequence2batch.cc index 7cad5b6c0b929..302dd6ec6ac62 100644 --- a/paddle/phi/kernels/funcs/sequence2batch.cc +++ b/paddle/phi/kernels/funcs/sequence2batch.cc @@ -21,9 +21,9 @@ template class CopyMatrixRowsFunctor { public: void operator()(const phi::CPUContext& context, - const paddle::framework::Tensor& src, + const phi::DenseTensor& src, paddle::framework::Vector index_lod, - paddle::framework::Tensor* dst, + phi::DenseTensor* dst, bool is_src_index) { size_t* index = index_lod.data(); auto src_dims = src.dims(); diff --git a/paddle/phi/kernels/funcs/sequence2batch.cu b/paddle/phi/kernels/funcs/sequence2batch.cu index 196ca7a2ef96e..6c8ec9bca017a 100644 --- a/paddle/phi/kernels/funcs/sequence2batch.cu +++ b/paddle/phi/kernels/funcs/sequence2batch.cu @@ -42,9 +42,9 @@ template class CopyMatrixRowsFunctor { public: void operator()(const phi::GPUContext& context, - const paddle::framework::Tensor& src, + const phi::DenseTensor& src, paddle::framework::Vector index_lod, - paddle::framework::Tensor* dst, + phi::DenseTensor* dst, bool is_src_index) { auto src_dims = src.dims(); auto dst_dims = dst->dims(); diff --git a/paddle/phi/kernels/funcs/sequence2batch.h b/paddle/phi/kernels/funcs/sequence2batch.h index ed3a50d883dc0..e73004303d576 100644 --- a/paddle/phi/kernels/funcs/sequence2batch.h +++ b/paddle/phi/kernels/funcs/sequence2batch.h @@ -38,9 +38,9 @@ class CopyMatrixRowsFunctor { // copy the input src to the indexed rows of output dst. // The indexed rows are based on the input index. void operator()(const DeviceContext& context, - const paddle::framework::Tensor& src, + const phi::DenseTensor& src, paddle::framework::Vector index_lod, - paddle::framework::Tensor* dst, + phi::DenseTensor* dst, bool is_src_index); }; diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 5c6fd04c15e68..4b63f6758aa29 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -1105,7 +1105,7 @@ void BatchNormKernel(const Context &ctx, // Create reserve space and workspace for batch norm. 
// Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. - // auto *reserve_space = ctx.Output("ReserveSpace"); + // auto *reserve_space = ctx.Output("ReserveSpace"); if (reserve_space == nullptr) { reserve_space = &reserve_space_tensor; } diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index eae7b77519911..9acd67390face 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -47,12 +47,12 @@ template & strides, const std::vector& paddings, const std::vector& dilations, - framework::Tensor* output, + phi::DenseTensor* output, const DataLayout data_layout = DataLayout::kNCHW); }; @@ -62,13 +62,13 @@ template & strides, const std::vector& paddings, const std::vector& dilations, - framework::Tensor* input_grad, + phi::DenseTensor* input_grad, const DataLayout data_layout = DataLayout::kNCHW); }; @@ -78,12 +78,12 @@ template & strides, const std::vector& paddings, const std::vector& dilations, - framework::Tensor* filter_grad, + phi::DenseTensor* filter_grad, const DataLayout data_layout = DataLayout::kNCHW); }; @@ -176,7 +176,8 @@ __device__ __inline__ void KernelDepthwiseConvNCHW( int offset = in_offset + h_in * input_width + w_in; T in_data = input_data[offset]; if (fuse_relu_before_conv) { - value += weight[weight_offset] * T(max(0.0f, double(in_data))); + value += + weight[weight_offset] * static(max(0.0f, double(in_data))); } else { value += weight[weight_offset] * in_data; } @@ -228,7 +229,7 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( T in_data = input_data[offset]; const T* weight = filter_data + weight_offset * output_channels + c_out; if (fuse_relu_before_conv) { - value += weight[0] * T(max(0.0f, double(in_data))); + value += weight[0] * static_cast(max(0.0f, double(in_data))); } else { value += weight[0] * in_data; } @@ -281,7 +282,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW( int offset = in_offset + h_in * input_width + w_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * - T(max(0.0f, double(input_data[offset]))); + static_cast(max(0.0f, double(input_data[offset]))); } else { value += r_weight[h_f * c_filter + w_f] * input_data[offset]; } @@ -337,7 +338,7 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( in_offset + (h_in * input_width + w_in) * input_channels + c_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * - T(max(0.0, double(input_data[offset]))); + static_cast(max(0.0, double(input_data[offset]))); } else { value += r_weight[h_f * c_filter + w_f] * input_data[offset]; } @@ -880,7 +881,7 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( image_wk; if (fuse_relu_before_conv) { s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - T(max(0.0f, double(input_data[input_id]))); + static_cast(max(0.0f, double(input_data[input_id]))); } else { s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * input_data[input_id]; @@ -941,7 +942,7 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( kernel_id / filter_multiplier; if (fuse_relu_before_conv) { s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] * - T(max(0.0f, double(input_data[input_id]))); + static_cast(max(0.0f, double(input_data[input_id]))); } else { s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] * input_data[input_id]; @@ -1013,7 +1014,7 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( T s(0); 
if (fuse_relu_before_conv) { s = output_grad_data[output_id] * - T(max(0.0f, double(input_data[input_id]))); + static_cast(max(0.0f, double(input_data[input_id]))); } else { s = output_grad_data[output_id] * input_data[input_id]; } @@ -1163,12 +1164,12 @@ template class DepthwiseConvFunctor { public: void operator()(const phi::GPUContext& context, - const framework::Tensor& input, - const framework::Tensor& filter, + const phi::DenseTensor& input, + const phi::DenseTensor& filter, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, - framework::Tensor* output, + phi::DenseTensor* output, const DataLayout data_layout = DataLayout::kNCHW) { const int batch_size = input.dims()[0]; const int input_channels = @@ -1199,7 +1200,7 @@ class DepthwiseConvFunctor { const T* filter_data = filter.data(); T* output_data = output->mutable_data(context.GetPlace()); - framework::Tensor filter_hwc; + phi::DenseTensor filter_hwc; if (data_layout == DataLayout::kNHWC) { framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3], @@ -1340,13 +1341,13 @@ template class DepthwiseConvInputGradFunctor { public: void operator()(const phi::GPUContext& context, - const framework::Tensor& input, - const framework::Tensor& filter, - const framework::Tensor& output_grad, + const phi::DenseTensor& input, + const phi::DenseTensor& filter, + const phi::DenseTensor& output_grad, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, - framework::Tensor* input_grad, + phi::DenseTensor* input_grad, const DataLayout data_layout = DataLayout::kNCHW) { const int batch_size = input.dims()[0]; const int input_channels = @@ -1378,7 +1379,7 @@ class DepthwiseConvInputGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - framework::Tensor filter_hwc; + phi::DenseTensor filter_hwc; if (data_layout == DataLayout::kNHWC) { framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3], @@ -1505,12 +1506,12 @@ class DepthwiseConvFilterGradFunctor { public: void operator()(const phi::GPUContext& context, - const framework::Tensor& input, - const framework::Tensor& output_grad, + const phi::DenseTensor& input, + const phi::DenseTensor& output_grad, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, - framework::Tensor* filter_grad, + phi::DenseTensor* filter_grad, const DataLayout data_layout = DataLayout::kNCHW) { const int batch_size = input.dims()[0]; const int input_channels = @@ -1598,7 +1599,7 @@ class DepthwiseConvFilterGradFunctordims()[2], \ filter_grad->dims()[3], \ diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h index 8731316317d47..bd8e529ff2ee5 100644 --- a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h +++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h @@ -84,19 +84,19 @@ void AverageAccumulatesKernel(const Context& dev_ctx, max_average_window)); // Get inputs - // auto* param = ctx.Input("param"); - // auto* in_sum_1 = ctx.Input("in_sum_1"); - // auto* in_sum_2 = ctx.Input("in_sum_2"); - // auto* in_sum_3 = ctx.Input("in_sum_3"); + // auto* param = ctx.Input("param"); + // auto* in_sum_1 = ctx.Input("in_sum_1"); + // auto* in_sum_2 = ctx.Input("in_sum_2"); + // auto* in_sum_3 = ctx.Input("in_sum_3"); auto param_tensor = EigenVector::Flatten(param); auto in_sum_1_tensor = EigenVector::Flatten(in_sum_1); auto in_sum_2_tensor = 
EigenVector::Flatten(in_sum_2); auto in_sum_3_tensor = EigenVector::Flatten(in_sum_3); // Get outputs - // auto* out_sum_1 = ctx.Output("out_sum_1"); - // auto* out_sum_2 = ctx.Output("out_sum_2"); - // auto* out_sum_3 = ctx.Output("out_sum_3"); + // auto* out_sum_1 = ctx.Output("out_sum_1"); + // auto* out_sum_2 = ctx.Output("out_sum_2"); + // auto* out_sum_3 = ctx.Output("out_sum_3"); dev_ctx.template Alloc(out_sum_1); dev_ctx.template Alloc(out_sum_2); dev_ctx.template Alloc(out_sum_3); diff --git a/paddle/phi/tests/kernels/test_math_function.cc b/paddle/phi/tests/kernels/test_math_function.cc index b21cf0203febe..bcb0e9d7adc7e 100644 --- a/paddle/phi/tests/kernels/test_math_function.cc +++ b/paddle/phi/tests/kernels/test_math_function.cc @@ -26,9 +26,9 @@ inline phi::funcs::BlasT GetBlas( } TEST(math_function, gemm_notrans_cblas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; + phi::DenseTensor input1; + phi::DenseTensor input2; + phi::DenseTensor input3; int m = 2; int n = 3; @@ -71,10 +71,10 @@ TEST(math_function, gemm_notrans_cblas) { #ifdef PADDLE_WITH_LIBXSMM template void MklSmmCompare(int m, int n, int k) { - paddle::framework::Tensor mat_a; - paddle::framework::Tensor mat_b; - paddle::framework::Tensor mat_c_smm; - paddle::framework::Tensor mat_c_mkl; + phi::DenseTensor mat_a; + phi::DenseTensor mat_b; + phi::DenseTensor mat_c_smm; + phi::DenseTensor mat_c_mkl; auto* cpu_place = new paddle::platform::CPUPlace(); T* A = mat_a.mutable_data({m, k}, *cpu_place); @@ -147,9 +147,9 @@ TEST(math_function, gemm_mkl_vs_smm) { #endif TEST(math_function, gemm_trans_cblas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; + phi::DenseTensor input1; + phi::DenseTensor input2; + phi::DenseTensor input3; int m = 2; int n = 3; @@ -193,7 +193,7 @@ TEST(math_function, gemm_trans_cblas) { } TEST(math_function, zero) { - paddle::framework::Tensor tensor; + phi::DenseTensor tensor; auto* cpu_place = new paddle::platform::CPUPlace(); float* t = tensor.mutable_data({2, 2}, *cpu_place); phi::CPUContext context(*cpu_place); @@ -214,9 +214,9 @@ TEST(math_function, zero) { template void GemvTest(int m, int n, bool trans) { - paddle::framework::Tensor mat_a; - paddle::framework::Tensor vec_b; - paddle::framework::Tensor vec_c; + phi::DenseTensor mat_a; + phi::DenseTensor vec_b; + phi::DenseTensor vec_c; auto* cpu_place = new paddle::platform::CPUPlace(); int b_num = trans ? m : n; int c_num = trans ? 
n : m; @@ -269,7 +269,7 @@ TEST(math_function, gemv) { } TEST(math_funciton, set_constant) { - paddle::framework::Tensor t; + phi::DenseTensor t; t.Resize({10, 10}); t.mutable_data(paddle::platform::CPUPlace()); auto* ctx = new phi::CPUContext(); @@ -287,10 +287,10 @@ TEST(math_funciton, set_constant) { template void GemmWarpTest(int m, int n, int k, T alpha, T beta) { - paddle::framework::Tensor mat_a; - paddle::framework::Tensor mat_b; - paddle::framework::Tensor mat_c_ref; - paddle::framework::Tensor mat_c_mkl; + phi::DenseTensor mat_a; + phi::DenseTensor mat_b; + phi::DenseTensor mat_c_ref; + phi::DenseTensor mat_c_mkl; auto* cpu_place = new paddle::platform::CPUPlace(); T* A = mat_a.mutable_data({m, k}, *cpu_place); diff --git a/paddle/phi/tests/kernels/test_math_function.cu b/paddle/phi/tests/kernels/test_math_function.cu index 479d874626a4e..b227523ce0bc5 100644 --- a/paddle/phi/tests/kernels/test_math_function.cu +++ b/paddle/phi/tests/kernels/test_math_function.cu @@ -43,11 +43,11 @@ inline phi::funcs::BlasT GetBlas( } TEST(math_function, notrans_mul_trans_fp32) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; + phi::DenseTensor input1; + phi::DenseTensor input1_gpu; + phi::DenseTensor input2_gpu; + phi::DenseTensor out_gpu; + phi::DenseTensor out; paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); @@ -79,11 +79,11 @@ TEST(math_function, notrans_mul_trans_fp32) { } TEST(math_function, notrans_mul_trans_fp16) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; + phi::DenseTensor input1; + phi::DenseTensor input1_gpu; + phi::DenseTensor input2_gpu; + phi::DenseTensor out_gpu; + phi::DenseTensor out; paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); @@ -126,11 +126,11 @@ TEST(math_function, notrans_mul_trans_fp16) { } TEST(math_function, trans_mul_notrans_fp32) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; + phi::DenseTensor input1; + phi::DenseTensor input1_gpu; + phi::DenseTensor input2_gpu; + phi::DenseTensor out_gpu; + phi::DenseTensor out; paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); @@ -168,11 +168,11 @@ TEST(math_function, trans_mul_notrans_fp32) { } TEST(math_function, trans_mul_notrans_fp16) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; + phi::DenseTensor input1; + phi::DenseTensor input1_gpu; + phi::DenseTensor input2_gpu; + phi::DenseTensor out_gpu; + phi::DenseTensor out; paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); @@ -220,12 +220,12 @@ TEST(math_function, trans_mul_notrans_fp16) { } TEST(math_function, gemm_notrans_cublas_fp32) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor input3_gpu; + phi::DenseTensor input1; + phi::DenseTensor input2; + phi::DenseTensor input3; + phi::DenseTensor input1_gpu; + phi::DenseTensor input2_gpu; + 
phi::DenseTensor input3_gpu; paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); @@ -278,12 +278,12 @@ TEST(math_function, gemm_notrans_cublas_fp32) { } TEST(math_function, gemm_notrans_cublas_fp16) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor input3_gpu; + phi::DenseTensor input1; + phi::DenseTensor input2; + phi::DenseTensor input3; + phi::DenseTensor input1_gpu; + phi::DenseTensor input2_gpu; + phi::DenseTensor input3_gpu; paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); @@ -355,12 +355,12 @@ TEST(math_function, gemm_notrans_cublas_fp16) { } TEST(math_function, gemm_trans_cublas_fp32) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor input3_gpu; + phi::DenseTensor input1; + phi::DenseTensor input2; + phi::DenseTensor input3; + phi::DenseTensor input1_gpu; + phi::DenseTensor input2_gpu; + phi::DenseTensor input3_gpu; paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); @@ -407,12 +407,12 @@ TEST(math_function, gemm_trans_cublas_fp32) { } TEST(math_function, gemm_trans_cublas_fp16) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor input3_gpu; + phi::DenseTensor input1; + phi::DenseTensor input2; + phi::DenseTensor input3; + phi::DenseTensor input1_gpu; + phi::DenseTensor input2_gpu; + phi::DenseTensor input3_gpu; paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); @@ -479,9 +479,9 @@ TEST(math_function, gemm_trans_cublas_fp16) { template void GemvTest(int m, int n, bool trans) { - paddle::framework::Tensor mat_a; - paddle::framework::Tensor vec_b; - paddle::framework::Tensor vec_c; + phi::DenseTensor mat_a; + phi::DenseTensor vec_b; + phi::DenseTensor vec_c; paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); @@ -495,9 +495,9 @@ void GemvTest(int m, int n, bool trans) { T* data_b = vec_b.mutable_data({trans ? m : n}, cpu_place); T* data_c = vec_c.mutable_data({trans ? 
n : m}, cpu_place); - paddle::framework::Tensor g_mat_a; - paddle::framework::Tensor g_vec_b; - paddle::framework::Tensor g_vec_c; + phi::DenseTensor g_mat_a; + phi::DenseTensor g_vec_b; + phi::DenseTensor g_vec_c; T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), gpu_place); T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), gpu_place); T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), gpu_place); diff --git a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cc b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cc index c9a3f7a9071b5..c3c9f2bd617fe 100644 --- a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cc @@ -16,17 +16,14 @@ #include "paddle/fluid/framework/custom_raw_op_kernel_func.h" #include "paddle/fluid/platform/enforce.h" -void ReluCPUForward(const paddle::framework::Tensor &x, - paddle::framework::Tensor *y) { +void ReluCPUForward(const phi::DenseTensor &x, phi::DenseTensor *y) { custom_raw_op::ReluForward(x, y); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -void ReluGPUForward(const paddle::framework::Tensor &x, - paddle::framework::Tensor *y); +void ReluGPUForward(const phi::DenseTensor &x, phi::DenseTensor *y); #else -void ReluGPUForward(const paddle::framework::Tensor &x, - paddle::framework::Tensor *y) { +void ReluGPUForward(const phi::DenseTensor &x, phi::DenseTensor *y) { PADDLE_THROW(paddle::platform::errors::Unimplemented( "ReluGPUForward is not supported when not compiled with GPU.")); } diff --git a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cu b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cu index 72cab225d13a5..afdb73a328162 100644 --- a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cu @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include "custom_raw_op_kernel_op.h" // NOLINT +#include -void ReluGPUForward(const paddle::framework::Tensor &x, - paddle::framework::Tensor *y) { +void ReluGPUForward(const phi::DenseTensor &x, phi::DenseTensor *y) { custom_raw_op::ReluForward(x, y); } diff --git a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h index 70919708e19dd..24cea81b9eb91 100644 --- a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h +++ b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h @@ -23,8 +23,7 @@ namespace custom_raw_op { struct ReluFunctor { - explicit ReluFunctor(const paddle::framework::Tensor &x, - paddle::framework::Tensor *y) + explicit ReluFunctor(const phi::DenseTensor &x, phi::DenseTensor *y) : x_(x), y_(y) {} template @@ -72,12 +71,11 @@ struct ReluFunctor { } private: - const paddle::framework::Tensor &x_; - paddle::framework::Tensor *y_; + const phi::DenseTensor &x_; + phi::DenseTensor *y_; }; -inline void ReluForward(const paddle::framework::Tensor &x, - paddle::framework::Tensor *y) { +inline void ReluForward(const phi::DenseTensor &x, phi::DenseTensor *y) { custom_raw_op::ReluFunctor functor(x, y); paddle::framework::VisitDataType( paddle::framework::TransToProtoVarType(x.dtype()), functor); From 0d399f69b131c4171e0f6cc190fee7f41149cfd0 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 23 Sep 2022 03:32:43 +0000 Subject: [PATCH 02/15] remove needless using tensor --- paddle/fluid/framework/tensor_util_test.cc | 30 ++++++++----------- .../fused/fused_multi_transformer_op.cc | 2 -- .../fluid/operators/gather_scatter_kernel.cc | 4 +-- paddle/phi/kernels/gpu/depthwise_conv.h | 4 +-- 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 3d3c7de73b729..2e07d3aa5a638 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include - -#include - #include "paddle/fluid/framework/tensor_util.h" +#include #include "paddle/fluid/operators/isfinite_op.h" +#include + namespace paddle { namespace framework { @@ -256,22 +255,19 @@ TEST(TensorToVector, Tensor) { #endif } -TEST(TensorToVector, Tensor_bool) { - phi::DenseTensor src; - bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); - for (int i = 0; i < 3 * 3; ++i) { - src_ptr[i] = static_cast(i % 2); - } +TEST(TensorToVector, Tensor_bool){{phi::DenseTensor src; +bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); +for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); +} - paddle::platform::CPUPlace place; - std::vector dst; - paddle::framework::TensorToVector(src, &dst); +paddle::platform::CPUPlace place; +std::vector dst; +paddle::framework::TensorToVector(src, &dst); - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_ptr[i], dst[i]); - } +for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); } - } // namespace framework #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index ede6300decbe5..cb5d5b17dfeb6 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fused/fused_multi_transformer_op.h" - #include #include diff --git a/paddle/fluid/operators/gather_scatter_kernel.cc b/paddle/fluid/operators/gather_scatter_kernel.cc index e05a214dcb4c1..b8c870cd77569 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cc +++ b/paddle/fluid/operators/gather_scatter_kernel.cc @@ -120,8 +120,8 @@ struct cpu_gather_scatter_functor { self_idx = is_scatter_like ? replace_index : index_idx; src_idx = is_scatter_like ? 
index_idx : replace_index; - reduce_op(static_cast(self_data + self_idx), - static_cast(src_data + src_idx)); + reduce_op((tensor_t*)(self_data + self_idx), + (tensor_t*)(src_data + src_idx)); index_idx++; } } diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 9acd67390face..1fbc7cf9e4a29 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -176,8 +176,8 @@ __device__ __inline__ void KernelDepthwiseConvNCHW( int offset = in_offset + h_in * input_width + w_in; T in_data = input_data[offset]; if (fuse_relu_before_conv) { - value += - weight[weight_offset] * static(max(0.0f, double(in_data))); + value += weight[weight_offset] * + T(max(0.0f, static_cast(in_data))); // NOLINT } else { value += weight[weight_offset] * in_data; } From 6864dfc7c8e51f3ce80f3f36bb3db395a4ee08f5 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 23 Sep 2022 06:39:20 +0000 Subject: [PATCH 03/15] resolve conflict --- .../operators/collective/c_allreduce_op.h | 10 ++++---- .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 23 ++++++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 9749f446b8ea1..4d90442afbc5a 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -79,7 +79,7 @@ class CAllReduceOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, - const framework::Tensor& tensor, + const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { if (var_name == "Cond") { return expected_kernel_type; @@ -193,7 +193,7 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) if (ctx.HasInput("Cond")) { - auto cond = ctx.Input("Cond"); + auto cond = ctx.Input("Cond"); auto place = cond->place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(place), true, @@ -327,7 +327,7 @@ class CAllReduceOpXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_XPU_BKCL) if (ctx.HasInput("Cond")) { - auto cond = ctx.Input("Cond"); + auto cond = ctx.Input("Cond"); auto place = cond->place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(place), true, @@ -412,7 +412,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { if (ctx.HasInput("Cond")) { - auto cond = ctx.Input("Cond"); + auto cond = ctx.Input("Cond"); auto place = cond->place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(place), true, @@ -533,7 +533,7 @@ class CAllReduceOpMLUKernel : public framework::OpKernel { auto out = ctx.Output("Out"); if (ctx.HasInput("Cond")) { - auto cond = ctx.Input("Cond"); + auto cond = ctx.Input("Cond"); auto place = cond->place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(place), true, diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 00ae785bca95d..a4853131de161 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -27,7 +27,6 @@ using dnnl::primitive; using dnnl::reorder; using dnnl::stream; using framework::DataLayout; -using framework::Tensor; using 
platform::to_void_cast; template @@ -38,8 +37,8 @@ class PoolingMKLDNNHandler public: PoolingMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, const dnnl::engine mkldnn_engine, - const Tensor* input, - Tensor* output) + const phi::DenseTensor* input, + phi::DenseTensor* output) : platform::MKLDNNHandlerNoCachingT( @@ -131,9 +130,9 @@ class PoolingMKLDNNHandler PoolingMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, const dnnl::engine mkldnn_engine, - const Tensor* in_x, - const Tensor* out_grad, - Tensor* in_x_grad) + const phi::DenseTensor* in_x, + const phi::DenseTensor* out_grad, + phi::DenseTensor* in_x_grad) : platform::MKLDNNHandlerNoCachingT { auto& dev_ctx = ctx.template device_context(); - const Tensor* input = ctx.Input("X"); - Tensor* output = ctx.Output("Out"); + const phi::DenseTensor* input = ctx.Input("X"); + phi::DenseTensor* output = ctx.Output("Out"); PoolingMKLDNNHandler handler(ctx, dev_ctx.GetEngine(), input, output); @@ -347,9 +346,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { true, paddle::platform::errors::PreconditionNotMet( "Operator DNNL PoolGrad must use CPUPlace")); - const Tensor* in_x = ctx.Input("X"); - const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); + const phi::DenseTensor* in_x = ctx.Input("X"); + const phi::DenseTensor* out_grad = + ctx.Input(framework::GradVarName("Out")); + phi::DenseTensor* in_x_grad = + ctx.Output(framework::GradVarName("X")); auto& dev_ctx = ctx.template device_context(); From e868f5950fe2c9df8124e63faceac6267558cc39 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 23 Sep 2022 12:11:08 +0000 Subject: [PATCH 04/15] replace tensor using --- .../paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cc b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cc index c3c9f2bd617fe..262a01f1eb044 100644 --- a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.cc @@ -31,8 +31,8 @@ void ReluGPUForward(const phi::DenseTensor &x, phi::DenseTensor *y) { __PD_DEFINE_RAW_OP_KERNEL_FUNC(custom_raw_relu, ctx) { namespace f = paddle::framework; - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Y"); + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Y"); PADDLE_ENFORCE_NOT_NULL(x, paddle::platform::errors::InvalidArgument( "Input(X) should not be nullptr.")); From d47fac1064401df7b0bf46cd4410cf42f9848b7b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 23 Sep 2022 12:41:59 +0000 Subject: [PATCH 05/15] fix format error --- paddle/fluid/framework/tensor_util_test.cc | 5 ++--- paddle/fluid/operators/gather_scatter_kernel.cc | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 2e07d3aa5a638..6fbfa503da657 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/tensor_util.h" #include -#include "paddle/fluid/operators/isfinite_op.h" - #include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/isfinite_op.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/gather_scatter_kernel.cc b/paddle/fluid/operators/gather_scatter_kernel.cc index b8c870cd77569..b579b3175d396 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cc +++ b/paddle/fluid/operators/gather_scatter_kernel.cc @@ -120,8 +120,8 @@ struct cpu_gather_scatter_functor { self_idx = is_scatter_like ? replace_index : index_idx; src_idx = is_scatter_like ? index_idx : replace_index; - reduce_op((tensor_t*)(self_data + self_idx), - (tensor_t*)(src_data + src_idx)); + reduce_op((tensor_t*)(self_data + self_idx), // NOLINT + (tensor_t*)(src_data + src_idx)); // NOLINT index_idx++; } } From 7328bd37b7f69c687380090d1ff90b25088deda4 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 26 Sep 2022 03:09:23 +0000 Subject: [PATCH 06/15] revert needless changing --- paddle/fluid/framework/tensor_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index c9d740dcf8fc4..378e56918a320 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -41,8 +41,8 @@ TEST(DenseTensor, DataAssert) { } catch (platform::EnforceNotMet& err) { caught = true; std::string ex_msg = err.what(); - EXPECT_TRUE(ex_msg.find("phi::DenseTensor holds no memory. Call " - "phi::DenseTensor::mutable_data firstly.") != + EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call " + "Tensor::mutable_data firstly.") != std::string::npos); } ASSERT_TRUE(caught); @@ -185,8 +185,8 @@ TEST(DenseTensor, ShareDataWith) { } catch (paddle::platform::EnforceNotMet& err) { caught = true; std::string ex_msg = err.what(); - EXPECT_TRUE(ex_msg.find("phi::DenseTensor holds no memory. Call " - "phi::DenseTensor::mutable_data firstly.") != + EXPECT_TRUE(ex_msg.find("Tensor holds no memory. 
Call " + "Tensor::mutable_data firstly.") != std::string::npos); } ASSERT_TRUE(caught); From 759f20364a2a9a591fefff97e8126ac5939048c7 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 26 Sep 2022 05:25:08 +0000 Subject: [PATCH 07/15] fix rocm and npu compile error --- paddle/fluid/operators/activation_op_npu.cc | 4 ++-- paddle/fluid/platform/device/gpu/rocm/miopen_desc.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 52a472a595a92..3c6e207b971bc 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -873,8 +873,8 @@ template class ExpNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); const auto& runner = NpuOpRunner("Exp", {*x}, {*out}, {}); auto stream = diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h index 158693f5dad70..8faae285e49e3 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h @@ -129,7 +129,7 @@ class TensorDescriptor { T* desc() { return desc_.get(); } T* desc() const { return desc_.get(); } - void set(const Tensor& tensor, const int groups = 1) { + void set(const phi::DenseTensor& tensor, const int groups = 1) { auto dims = phi::vectorize(tensor.dims()); std::vector strides(dims.size()); strides[dims.size() - 1] = 1; @@ -148,7 +148,7 @@ class TensorDescriptor { const_cast(strides.data()))); } - void set(const Tensor& tensor, const miopenTensorFormat_t format) { + void set(const phi::DenseTensor& tensor, const miopenTensorFormat_t format) { const int groups = 1; PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW, @@ -195,7 +195,7 @@ class FilterDescriptor { T* desc() { return desc_.get(); } T* desc() const { return desc_.get(); } - void set(const Tensor& tensor, + void set(const phi::DenseTensor& tensor, const miopenTensorFormat_t format, const int groups = 1) { PADDLE_ENFORCE_EQ(format, From 2efd90d7f77aba40ba87fc7a93324a4de8d8b9da Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 26 Sep 2022 06:32:26 +0000 Subject: [PATCH 08/15] fix cinn compile error --- .../paddle2cinn/cinn_graph_symbolization.cc | 4 ++-- .../paddle2cinn/cinn_graph_symbolization_test.cc | 2 +- .../fluid/framework/paddle2cinn/cinn_lib_test.cc | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc index 79ba56ab147a3..0e1a75ebe64ee 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc @@ -37,7 +37,7 @@ namespace paddle2cinn { using ir::Graph; using ir::Node; -using CinnTensor = ::cinn::hlir::Tensor; +using CinnTensor = ::cinn::hlir::framework::Tensor; using OpMapperContext = CinnGraphSymbolization::OpMapperContext; using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc; using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap; @@ -45,7 +45,7 @@ using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap; namespace utils { OpMapperContext::FeedInfo GetCinnFeedInfoFromTensor( - const phi::DenseTensor& tensor, bool skip_trans_type = 
false) { + const Tensor& tensor, bool skip_trans_type = false) { OpMapperContext::FeedInfo info; const auto& dim = tensor.dims(); for (int i = 0; i < dim.size(); i++) { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc index 929f009b2a3a2..12bd9564c1ae3 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -24,7 +24,7 @@ namespace paddle2cinn { using ::cinn::frontend::NetBuilder; using ir::Graph; using ir::Node; -using CinnTensor = ::cinn::hlir::Tensor; +using CinnTensor = ::cinn::hlir::framework::Tensor; using OpMapperContext = CinnGraphSymbolization::OpMapperContext; using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc; using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc index ee030bb39caa9..2dd09771cc5ea 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc @@ -52,7 +52,7 @@ Program CreateAddProgram() { return program; } -void SetRandData(hlir::Tensor tensor, Target target) { +void SetRandData(hlir::framework::Tensor tensor, Target target) { auto* data = tensor->mutable_data(target); std::random_device seed; std::default_random_engine engine(seed()); @@ -96,8 +96,8 @@ TEST(net_build, program_execute_multi_elementwise_add) { hlir::framework::GraphCompiler gc(target, scope, graph); auto runtime_program = gc.Build(); - scope->Var("A"); - scope->Var("B"); + scope->Var("A"); + scope->Var("B"); auto A = scope->GetTensor("A"); auto B = scope->GetTensor("B"); @@ -133,10 +133,10 @@ TEST(net_build, program_execute_fc) { hlir::framework::GraphCompiler gc(target, scope, graph); auto runtime_program = gc.Build(); - scope->Var(std::string(a.id())); - scope->Var(std::string(w.id())); - scope->Var(std::string(b.id())); - scope->Var(std::string(mul_out->id)); + scope->Var(std::string(a.id())); + scope->Var(std::string(w.id())); + scope->Var(std::string(b.id())); + scope->Var(std::string(mul_out->id)); auto a_ten = scope->GetTensor(std::string(a.id())); auto w_ten = scope->GetTensor(std::string(w.id())); From 0fc92e401235575fcf7200afbc04674b19d3ea7b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 26 Sep 2022 06:57:01 +0000 Subject: [PATCH 09/15] fix format error --- paddle/fluid/framework/tensor_util_test.cc | 120 +++++++++--------- .../fused/fused_multi_transformer_int8_op.cu | 2 +- .../fused/fused_multi_transformer_op.cu | 2 +- ...r_op.h => fused_multi_transformer_op.cu.h} | 0 .../operators/mkldnn/dequantize_mkldnn_op.cc | 6 +- 5 files changed, 66 insertions(+), 64 deletions(-) rename paddle/fluid/operators/fused/{fused_multi_transformer_op.h => fused_multi_transformer_op.cu.h} (100%) diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 6fbfa503da657..9097c43023bd2 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -254,78 +254,78 @@ TEST(TensorToVector, Tensor) { #endif } -TEST(TensorToVector, Tensor_bool){{phi::DenseTensor src; -bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); -for (int i = 0; i < 3 * 3; ++i) { - src_ptr[i] = static_cast(i % 2); -} +TEST(TensorToVector, Tensor_bool) { + phi::DenseTensor src; + bool* src_ptr = src.mutable_data({3, 3}, 
-{
-  std::vector<bool> src_vec = {
-      false,
-      true,
-      false,
-      true,
-      false,
-      true,
-      false,
-      true,
-      false,
-  };
-  phi::DenseTensor gpu_tensor;
-  paddle::platform::CUDAPlace place;
-  phi::GPUContext gpu_ctx(place);
-  gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
-                           .GetAllocator(place, gpu_ctx.stream())
-                           .get());
-  gpu_ctx.PartialInitWithAllocator();
-  paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
-
-  std::vector<bool> dst;
-  paddle::framework::TensorToVector<bool>(gpu_tensor, gpu_ctx, &dst);
-
-  for (int i = 0; i < 3 * 3; ++i) {
-    EXPECT_EQ(src_vec[i], dst[i]);
-  }
-}
+  {
+    std::vector<bool> src_vec = {
+        false,
+        true,
+        false,
+        true,
+        false,
+        true,
+        false,
+        true,
+        false,
+    };
+    phi::DenseTensor gpu_tensor;
+    paddle::platform::CUDAPlace place;
+    phi::GPUContext gpu_ctx(place);
+    gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                             .GetAllocator(place, gpu_ctx.stream())
+                             .get());
+    gpu_ctx.PartialInitWithAllocator();
+    paddle::framework::TensorFromVector<bool>(src_vec, gpu_ctx, &gpu_tensor);
+
+    std::vector<bool> dst;
+    paddle::framework::TensorToVector<bool>(gpu_tensor, gpu_ctx, &dst);
+
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_vec[i], dst[i]);
+    }
+  }
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
-{
-  std::vector<bool> src_vec = {
-      false,
-      true,
-      false,
-      true,
-      false,
-      true,
-      false,
-      true,
-      false,
-  };
-  phi::DenseTensor npu_tensor;
-  paddle::platform::NPUPlace place(0);
-  paddle::platform::NPUDeviceContext npu_ctx(place);
-  paddle::framework::TensorFromVector<bool>(src_vec, npu_ctx, &npu_tensor);
-
-  std::vector<bool> dst;
-  paddle::framework::TensorToVector<bool>(npu_tensor, npu_ctx, &dst);
-
-  for (int i = 0; i < 3 * 3; ++i) {
-    EXPECT_EQ(src_vec[i], dst[i]);
-  }
-}
+  {
+    std::vector<bool> src_vec = {
+        false,
+        true,
+        false,
+        true,
+        false,
+        true,
+        false,
+        true,
+        false,
+    };
+    phi::DenseTensor npu_tensor;
+    paddle::platform::NPUPlace place(0);
+    paddle::platform::NPUDeviceContext npu_ctx(place);
+    paddle::framework::TensorFromVector<bool>(src_vec, npu_ctx, &npu_tensor);
+
+    std::vector<bool> dst;
+    paddle::framework::TensorToVector<bool>(npu_tensor, npu_ctx, &dst);
+
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_vec[i], dst[i]);
+    }
+  }
 #endif
-}  // namespace paddle
+}
 
 TEST(TensorFromDLPack, Tensor) {
   {
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
index fe1ee3449a102..681748c71c91a 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/fused/attn_gemm_int8.h"
-#include "paddle/fluid/operators/fused/fused_multi_transformer_op.h"
+#include "paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
index b70f0c7ea1965..01464b7241655 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/fused/fused_multi_transformer_op.h"
+#include "paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
similarity index 100%
rename from paddle/fluid/operators/fused/fused_multi_transformer_op.h
rename to paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index 4ceddf53f9458..29cccfc9fb0d4 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "dnnl.hpp"  // NOLINT
+#include "paddle/fluid/operators/dequantize_op.h"
+
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/dequantize_op.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
+#include "dnnl.hpp"  // NOLINT
+
 namespace paddle {
 namespace operators {

From f3bbc1619a5db69dfc589bf39d228984aefee315 Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Mon, 26 Sep 2022 07:13:29 +0000
Subject: [PATCH 10/15] fix mkldnn format error

---
 paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index 29cccfc9fb0d4..12ac31804a0db 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -20,8 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
-#include "dnnl.hpp"  // NOLINT
-
 namespace paddle {
 namespace operators {

From 3db39803de3b96d2848d0d647f9212d5406be633 Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Mon, 26 Sep 2022 09:18:52 +0000
Subject: [PATCH 11/15] fix mkldnn format error

---
 paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
index af8843c74179e..ae94266b4da71 100644
--- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "dnnl.hpp"
+#include "paddle/fluid/operators/quantize_op.h"
+
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/quantize_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"

From fd6425b12f08edea7576e266a96dd58646f6cee2 Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Mon, 26 Sep 2022 13:20:25 +0000
Subject: [PATCH 12/15] fix cinn compile error

---
 paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
index 0e1a75ebe64ee..b54a94b5149ca 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
@@ -45,7 +45,7 @@ using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap;
 namespace utils {
 
 OpMapperContext::FeedInfo GetCinnFeedInfoFromTensor(
-    const Tensor& tensor, bool skip_trans_type = false) {
+    const CinnTensor& tensor, bool skip_trans_type = false) {
   OpMapperContext::FeedInfo info;
   const auto& dim = tensor.dims();
   for (int i = 0; i < dim.size(); i++) {

From 00db968363e19df42533b556cdbe740d0c73e770 Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Mon, 26 Sep 2022 14:09:56 +0000
Subject: [PATCH 13/15] fix cinn compile error

---
 paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
index b54a94b5149ca..94bc1241895ef 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc
@@ -45,7 +45,7 @@ using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap;
 namespace utils {
 
 OpMapperContext::FeedInfo GetCinnFeedInfoFromTensor(
-    const CinnTensor& tensor, bool skip_trans_type = false) {
+    const phi::DenseTensor& tensor, bool skip_trans_type = false) {
   OpMapperContext::FeedInfo info;
   const auto& dim = tensor.dims();
   for (int i = 0; i < dim.size(); i++) {

From e1751ac629edc256e14063dfbaad7591026729fa Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Tue, 27 Sep 2022 02:29:08 +0000
Subject: [PATCH 14/15] fix cinn compile error

---
 paddle/fluid/operators/cinn/cinn_launch_context.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h
index a868a182bfc5e..0bbbcc8b03177 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_context.h
+++ b/paddle/fluid/operators/cinn/cinn_launch_context.h
@@ -52,7 +52,7 @@ class CinnCompiledObject;
 
 namespace operators::details {
 
-using CinnTensor = ::cinn::hlir::Tensor;
+using CinnTensor = ::cinn::hlir::framework::Tensor;
 using CinnScope = ::cinn::hlir::framework::Scope;
 using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject;

From 6cf126d2b0655dfa185600ee9dc1e78c06c58e3f Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Tue, 27 Sep 2022 11:25:20 +0000
Subject: [PATCH 15/15] resolve conflict

---
 paddle/fluid/pybind/tensor.cc | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc
index ca795dbe6e001..6441718e4116f 100644
--- a/paddle/fluid/pybind/tensor.cc
+++ b/paddle/fluid/pybind/tensor.cc
@@ -1115,10 +1115,9 @@ void BindTensor(pybind11::module &m) {  // NOLINT
            [](const phi::SparseCooTensor &self) -> int64_t {
              return self.numel();
            })
-      .def("indices",
-           [](const phi::SparseCooTensor &self) -> framework::Tensor {
-             return self.indices();
-           });
+      .def("indices", [](const phi::SparseCooTensor &self) -> phi::DenseTensor {
+        return self.indices();
+      });
 }
 
 }  // namespace pybind
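
Taken together, the series applies one mechanical substitution and then chases its fallout across the NPU, ROCm, CINN, oneDNN and pybind code paths: the per-file convenience alias for the tensor type is dropped, and phi::DenseTensor is spelled out at every use site (or a fully qualified alias such as CinnTensor = ::cinn::hlir::framework::Tensor is kept where two libraries both define a type named Tensor). The sketch below is illustrative only and is not part of any patch above; the operator, the kernel name CopyExampleKernel, and the use of TensorCopy are assumptions chosen to keep the example self-contained, while the Input/Output calls mirror the idiom the series standardizes on.

    // Illustrative sketch only (not from the series): a hypothetical kernel
    // written in the post-series style, with the tensor type spelled out as
    // phi::DenseTensor instead of relying on a removed "using Tensor" alias.
    #include "paddle/fluid/framework/op_registry.h"
    #include "paddle/fluid/framework/tensor_util.h"

    namespace paddle {
    namespace operators {

    template <typename T>
    class CopyExampleKernel : public framework::OpKernel<T> {
     public:
      void Compute(const framework::ExecutionContext& ctx) const override {
        // Before the series this would typically read ctx.Input<Tensor>("X").
        auto* x = ctx.Input<phi::DenseTensor>("X");
        auto* out = ctx.Output<phi::DenseTensor>("Out");
        out->mutable_data<T>(ctx.GetPlace());
        // Plain copy, enough to show the DenseTensor-based kernel API.
        framework::TensorCopy(*x, ctx.GetPlace(), out);
      }
    };

    }  // namespace operators
    }  // namespace paddle

Spelling the type out removes the dependence on whichever header happened to inject a Tensor name into the translation unit, which is exactly the class of breakage the compile-error fixups in patches 07, 08, 12, 13 and 14 are cleaning up.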